diff options
Diffstat (limited to 'fs')
194 files changed, 39861 insertions, 4851 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index fd01d90cada5..57997fa14e69 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h | |||
| @@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp); | |||
| 51 | int v9fs_file_open(struct inode *inode, struct file *file); | 51 | int v9fs_file_open(struct inode *inode, struct file *file); |
| 52 | void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); | 52 | void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); |
| 53 | void v9fs_dentry_release(struct dentry *); | 53 | void v9fs_dentry_release(struct dentry *); |
| 54 | int v9fs_uflags2omode(int uflags); | 54 | int v9fs_uflags2omode(int uflags, int extended); |
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 0d55affe37d4..52944d2249a4 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c | |||
| @@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) | |||
| 59 | 59 | ||
| 60 | P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); | 60 | P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); |
| 61 | v9ses = v9fs_inode2v9ses(inode); | 61 | v9ses = v9fs_inode2v9ses(inode); |
| 62 | omode = v9fs_uflags2omode(file->f_flags); | 62 | omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); |
| 63 | fid = file->private_data; | 63 | fid = file->private_data; |
| 64 | if (!fid) { | 64 | if (!fid) { |
| 65 | fid = v9fs_fid_clone(file->f_path.dentry); | 65 | fid = v9fs_fid_clone(file->f_path.dentry); |
| @@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) | |||
| 75 | inode->i_size = 0; | 75 | inode->i_size = 0; |
| 76 | inode->i_blocks = 0; | 76 | inode->i_blocks = 0; |
| 77 | } | 77 | } |
| 78 | if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) | ||
| 79 | generic_file_llseek(file, 0, SEEK_END); | ||
| 78 | } | 80 | } |
| 79 | 81 | ||
| 80 | file->private_data = fid; | 82 | file->private_data = fid; |
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 40fa807bd929..c95295c65045 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c | |||
| @@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) | |||
| 132 | /** | 132 | /** |
| 133 | * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits | 133 | * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits |
| 134 | * @uflags: flags to convert | 134 | * @uflags: flags to convert |
| 135 | * | 135 | * @extended: if .u extensions are active |
| 136 | */ | 136 | */ |
| 137 | 137 | ||
| 138 | int v9fs_uflags2omode(int uflags) | 138 | int v9fs_uflags2omode(int uflags, int extended) |
| 139 | { | 139 | { |
| 140 | int ret; | 140 | int ret; |
| 141 | 141 | ||
| @@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags) | |||
| 155 | break; | 155 | break; |
| 156 | } | 156 | } |
| 157 | 157 | ||
| 158 | if (uflags & O_EXCL) | ||
| 159 | ret |= P9_OEXCL; | ||
| 160 | |||
| 161 | if (uflags & O_TRUNC) | 158 | if (uflags & O_TRUNC) |
| 162 | ret |= P9_OTRUNC; | 159 | ret |= P9_OTRUNC; |
| 163 | 160 | ||
| 164 | if (uflags & O_APPEND) | 161 | if (extended) { |
| 165 | ret |= P9_OAPPEND; | 162 | if (uflags & O_EXCL) |
| 163 | ret |= P9_OEXCL; | ||
| 164 | |||
| 165 | if (uflags & O_APPEND) | ||
| 166 | ret |= P9_OAPPEND; | ||
| 167 | } | ||
| 166 | 168 | ||
| 167 | return ret; | 169 | return ret; |
| 168 | } | 170 | } |
| @@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, | |||
| 506 | flags = O_RDWR; | 508 | flags = O_RDWR; |
| 507 | 509 | ||
| 508 | fid = v9fs_create(v9ses, dir, dentry, NULL, perm, | 510 | fid = v9fs_create(v9ses, dir, dentry, NULL, perm, |
| 509 | v9fs_uflags2omode(flags)); | 511 | v9fs_uflags2omode(flags, v9fs_extended(v9ses))); |
| 510 | if (IS_ERR(fid)) { | 512 | if (IS_ERR(fid)) { |
| 511 | err = PTR_ERR(fid); | 513 | err = PTR_ERR(fid); |
| 512 | fid = NULL; | 514 | fid = NULL; |
diff --git a/fs/Kconfig b/fs/Kconfig index cf12c403b8c7..17216ba99c85 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
| @@ -830,7 +830,7 @@ config NTFS_FS | |||
| 830 | from the project web site. | 830 | from the project web site. |
| 831 | 831 | ||
| 832 | For more information see <file:Documentation/filesystems/ntfs.txt> | 832 | For more information see <file:Documentation/filesystems/ntfs.txt> |
| 833 | and <http://linux-ntfs.sourceforge.net/>. | 833 | and <http://www.linux-ntfs.org/>. |
| 834 | 834 | ||
| 835 | To compile this file system support as a module, choose M here: the | 835 | To compile this file system support as a module, choose M here: the |
| 836 | module will be called ntfs. | 836 | module will be called ntfs. |
| @@ -930,7 +930,7 @@ config PROC_KCORE | |||
| 930 | 930 | ||
| 931 | config PROC_VMCORE | 931 | config PROC_VMCORE |
| 932 | bool "/proc/vmcore support (EXPERIMENTAL)" | 932 | bool "/proc/vmcore support (EXPERIMENTAL)" |
| 933 | depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP | 933 | depends on PROC_FS && CRASH_DUMP |
| 934 | default y | 934 | default y |
| 935 | help | 935 | help |
| 936 | Exports the dump image of crashed kernel in ELF format. | 936 | Exports the dump image of crashed kernel in ELF format. |
| @@ -1375,6 +1375,9 @@ config JFFS2_CMODE_FAVOURLZO | |||
| 1375 | 1375 | ||
| 1376 | endchoice | 1376 | endchoice |
| 1377 | 1377 | ||
| 1378 | # UBIFS File system configuration | ||
| 1379 | source "fs/ubifs/Kconfig" | ||
| 1380 | |||
| 1378 | config CRAMFS | 1381 | config CRAMFS |
| 1379 | tristate "Compressed ROM file system support (cramfs)" | 1382 | tristate "Compressed ROM file system support (cramfs)" |
| 1380 | depends on BLOCK | 1383 | depends on BLOCK |
| @@ -1544,10 +1547,6 @@ config UFS_FS | |||
| 1544 | The recently released UFS2 variant (used in FreeBSD 5.x) is | 1547 | The recently released UFS2 variant (used in FreeBSD 5.x) is |
| 1545 | READ-ONLY supported. | 1548 | READ-ONLY supported. |
| 1546 | 1549 | ||
| 1547 | If you only intend to mount files from some other Unix over the | ||
| 1548 | network using NFS, you don't need the UFS file system support (but | ||
| 1549 | you need NFS file system support obviously). | ||
| 1550 | |||
| 1551 | Note that this option is generally not needed for floppies, since a | 1550 | Note that this option is generally not needed for floppies, since a |
| 1552 | good portable way to transport files and directories between unixes | 1551 | good portable way to transport files and directories between unixes |
| 1553 | (and even other operating systems) is given by the tar program ("man | 1552 | (and even other operating systems) is given by the tar program ("man |
| @@ -1587,6 +1586,7 @@ menuconfig NETWORK_FILESYSTEMS | |||
| 1587 | Say Y here to get to see options for network filesystems and | 1586 | Say Y here to get to see options for network filesystems and |
| 1588 | filesystem-related networking code, such as NFS daemon and | 1587 | filesystem-related networking code, such as NFS daemon and |
| 1589 | RPCSEC security modules. | 1588 | RPCSEC security modules. |
| 1589 | |||
| 1590 | This option alone does not add any kernel code. | 1590 | This option alone does not add any kernel code. |
| 1591 | 1591 | ||
| 1592 | If you say N, all options in this submenu will be skipped and | 1592 | If you say N, all options in this submenu will be skipped and |
| @@ -1595,76 +1595,92 @@ menuconfig NETWORK_FILESYSTEMS | |||
| 1595 | if NETWORK_FILESYSTEMS | 1595 | if NETWORK_FILESYSTEMS |
| 1596 | 1596 | ||
| 1597 | config NFS_FS | 1597 | config NFS_FS |
| 1598 | tristate "NFS file system support" | 1598 | tristate "NFS client support" |
| 1599 | depends on INET | 1599 | depends on INET |
| 1600 | select LOCKD | 1600 | select LOCKD |
| 1601 | select SUNRPC | 1601 | select SUNRPC |
| 1602 | select NFS_ACL_SUPPORT if NFS_V3_ACL | 1602 | select NFS_ACL_SUPPORT if NFS_V3_ACL |
| 1603 | help | 1603 | help |
| 1604 | If you are connected to some other (usually local) Unix computer | 1604 | Choose Y here if you want to access files residing on other |
| 1605 | (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing | 1605 | computers using Sun's Network File System protocol. To compile |
| 1606 | on that computer (the NFS server) using the Network File Sharing | 1606 | this file system support as a module, choose M here: the module |
| 1607 | protocol, say Y. "Mounting files" means that the client can access | 1607 | will be called nfs. |
| 1608 | the files with usual UNIX commands as if they were sitting on the | ||
| 1609 | client's hard disk. For this to work, the server must run the | ||
| 1610 | programs nfsd and mountd (but does not need to have NFS file system | ||
| 1611 | support enabled in its kernel). NFS is explained in the Network | ||
| 1612 | Administrator's Guide, available from | ||
| 1613 | <http://www.tldp.org/docs.html#guide>, on its man page: "man | ||
| 1614 | nfs", and in the NFS-HOWTO. | ||
| 1615 | 1608 | ||
| 1616 | A superior but less widely used alternative to NFS is provided by | 1609 | To mount file systems exported by NFS servers, you also need to |
| 1617 | the Coda file system; see "Coda file system support" below. | 1610 | install the user space mount.nfs command which can be found in |
| 1611 | the Linux nfs-utils package, available from http://linux-nfs.org/. | ||
| 1612 | Information about using the mount command is available in the | ||
| 1613 | mount(8) man page. More detail about the Linux NFS client | ||
| 1614 | implementation is available via the nfs(5) man page. | ||
| 1618 | 1615 | ||
| 1619 | If you say Y here, you should have said Y to TCP/IP networking also. | 1616 | Below you can choose which versions of the NFS protocol are |
| 1620 | This option would enlarge your kernel by about 27 KB. | 1617 | available in the kernel to mount NFS servers. Support for NFS |
| 1621 | 1618 | version 2 (RFC 1094) is always available when NFS_FS is selected. | |
| 1622 | To compile this file system support as a module, choose M here: the | ||
| 1623 | module will be called nfs. | ||
| 1624 | 1619 | ||
| 1625 | If you are configuring a diskless machine which will mount its root | 1620 | To configure a system which mounts its root file system via NFS |
| 1626 | file system over NFS at boot time, say Y here and to "Kernel | 1621 | at boot time, say Y here, select "Kernel level IP |
| 1627 | level IP autoconfiguration" above and to "Root file system on NFS" | 1622 | autoconfiguration" in the NETWORK menu, and select "Root file |
| 1628 | below. You cannot compile this driver as a module in this case. | 1623 | system on NFS" below. You cannot compile this file system as a |
| 1629 | There are two packages designed for booting diskless machines over | 1624 | module in this case. |
| 1630 | the net: netboot, available from | ||
| 1631 | <http://ftp1.sourceforge.net/netboot/>, and Etherboot, | ||
| 1632 | available from <http://ftp1.sourceforge.net/etherboot/>. | ||
| 1633 | 1625 | ||
| 1634 | If you don't know what all this is about, say N. | 1626 | If unsure, say N. |
| 1635 | 1627 | ||
| 1636 | config NFS_V3 | 1628 | config NFS_V3 |
| 1637 | bool "Provide NFSv3 client support" | 1629 | bool "NFS client support for NFS version 3" |
| 1638 | depends on NFS_FS | 1630 | depends on NFS_FS |
| 1639 | help | 1631 | help |
| 1640 | Say Y here if you want your NFS client to be able to speak version | 1632 | This option enables support for version 3 of the NFS protocol |
| 1641 | 3 of the NFS protocol. | 1633 | (RFC 1813) in the kernel's NFS client. |
| 1642 | 1634 | ||
| 1643 | If unsure, say Y. | 1635 | If unsure, say Y. |
| 1644 | 1636 | ||
| 1645 | config NFS_V3_ACL | 1637 | config NFS_V3_ACL |
| 1646 | bool "Provide client support for the NFSv3 ACL protocol extension" | 1638 | bool "NFS client support for the NFSv3 ACL protocol extension" |
| 1647 | depends on NFS_V3 | 1639 | depends on NFS_V3 |
| 1648 | help | 1640 | help |
| 1649 | Implement the NFSv3 ACL protocol extension for manipulating POSIX | 1641 | Some NFS servers support an auxiliary NFSv3 ACL protocol that |
| 1650 | Access Control Lists. The server should also be compiled with | 1642 | Sun added to Solaris but never became an official part of the |
| 1651 | the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. | 1643 | NFS version 3 protocol. This protocol extension allows |
| 1644 | applications on NFS clients to manipulate POSIX Access Control | ||
| 1645 | Lists on files residing on NFS servers. NFS servers enforce | ||
| 1646 | ACLs on local files whether this protocol is available or not. | ||
| 1647 | |||
| 1648 | Choose Y here if your NFS server supports the Solaris NFSv3 ACL | ||
| 1649 | protocol extension and you want your NFS client to allow | ||
| 1650 | applications to access and modify ACLs on files on the server. | ||
| 1651 | |||
| 1652 | Most NFS servers don't support the Solaris NFSv3 ACL protocol | ||
| 1653 | extension. You can choose N here or specify the "noacl" mount | ||
| 1654 | option to prevent your NFS client from trying to use the NFSv3 | ||
| 1655 | ACL protocol. | ||
| 1652 | 1656 | ||
| 1653 | If unsure, say N. | 1657 | If unsure, say N. |
| 1654 | 1658 | ||
| 1655 | config NFS_V4 | 1659 | config NFS_V4 |
| 1656 | bool "Provide NFSv4 client support (EXPERIMENTAL)" | 1660 | bool "NFS client support for NFS version 4 (EXPERIMENTAL)" |
| 1657 | depends on NFS_FS && EXPERIMENTAL | 1661 | depends on NFS_FS && EXPERIMENTAL |
| 1658 | select RPCSEC_GSS_KRB5 | 1662 | select RPCSEC_GSS_KRB5 |
| 1659 | help | 1663 | help |
| 1660 | Say Y here if you want your NFS client to be able to speak the newer | 1664 | This option enables support for version 4 of the NFS protocol |
| 1661 | version 4 of the NFS protocol. | 1665 | (RFC 3530) in the kernel's NFS client. |
| 1662 | 1666 | ||
| 1663 | Note: Requires auxiliary userspace daemons which may be found on | 1667 | To mount NFS servers using NFSv4, you also need to install user |
| 1664 | http://www.citi.umich.edu/projects/nfsv4/ | 1668 | space programs which can be found in the Linux nfs-utils package, |
| 1669 | available from http://linux-nfs.org/. | ||
| 1665 | 1670 | ||
| 1666 | If unsure, say N. | 1671 | If unsure, say N. |
| 1667 | 1672 | ||
| 1673 | config ROOT_NFS | ||
| 1674 | bool "Root file system on NFS" | ||
| 1675 | depends on NFS_FS=y && IP_PNP | ||
| 1676 | help | ||
| 1677 | If you want your system to mount its root file system via NFS, | ||
| 1678 | choose Y here. This is common practice for managing systems | ||
| 1679 | without local permanent storage. For details, read | ||
| 1680 | <file:Documentation/filesystems/nfsroot.txt>. | ||
| 1681 | |||
| 1682 | Most people say N here. | ||
| 1683 | |||
| 1668 | config NFSD | 1684 | config NFSD |
| 1669 | tristate "NFS server support" | 1685 | tristate "NFS server support" |
| 1670 | depends on INET | 1686 | depends on INET |
| @@ -1746,20 +1762,6 @@ config NFSD_V4 | |||
| 1746 | 1762 | ||
| 1747 | If unsure, say N. | 1763 | If unsure, say N. |
| 1748 | 1764 | ||
| 1749 | config ROOT_NFS | ||
| 1750 | bool "Root file system on NFS" | ||
| 1751 | depends on NFS_FS=y && IP_PNP | ||
| 1752 | help | ||
| 1753 | If you want your Linux box to mount its whole root file system (the | ||
| 1754 | one containing the directory /) from some other computer over the | ||
| 1755 | net via NFS (presumably because your box doesn't have a hard disk), | ||
| 1756 | say Y. Read <file:Documentation/filesystems/nfsroot.txt> for | ||
| 1757 | details. It is likely that in this case, you also want to say Y to | ||
| 1758 | "Kernel level IP autoconfiguration" so that your box can discover | ||
| 1759 | its network address at boot time. | ||
| 1760 | |||
| 1761 | Most people say N here. | ||
| 1762 | |||
| 1763 | config LOCKD | 1765 | config LOCKD |
| 1764 | tristate | 1766 | tristate |
| 1765 | 1767 | ||
| @@ -1800,27 +1802,6 @@ config SUNRPC_XPRT_RDMA | |||
| 1800 | 1802 | ||
| 1801 | If unsure, say N. | 1803 | If unsure, say N. |
| 1802 | 1804 | ||
| 1803 | config SUNRPC_BIND34 | ||
| 1804 | bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" | ||
| 1805 | depends on SUNRPC && EXPERIMENTAL | ||
| 1806 | default n | ||
| 1807 | help | ||
| 1808 | RPC requests over IPv6 networks require support for larger | ||
| 1809 | addresses when performing an RPC bind. Sun added support for | ||
| 1810 | IPv6 addressing by creating two new versions of the rpcbind | ||
| 1811 | protocol (RFC 1833). | ||
| 1812 | |||
| 1813 | This option enables support in the kernel RPC client for | ||
| 1814 | querying rpcbind servers via versions 3 and 4 of the rpcbind | ||
| 1815 | protocol. The kernel automatically falls back to version 2 | ||
| 1816 | if a remote rpcbind service does not support versions 3 or 4. | ||
| 1817 | By themselves, these new versions do not provide support for | ||
| 1818 | RPC over IPv6, but the new protocol versions are necessary to | ||
| 1819 | support it. | ||
| 1820 | |||
| 1821 | If unsure, say N to get traditional behavior (version 2 rpcbind | ||
| 1822 | requests only). | ||
| 1823 | |||
| 1824 | config RPCSEC_GSS_KRB5 | 1805 | config RPCSEC_GSS_KRB5 |
| 1825 | tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" | 1806 | tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" |
| 1826 | depends on SUNRPC && EXPERIMENTAL | 1807 | depends on SUNRPC && EXPERIMENTAL |
diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da1..3b2178b4bb66 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
| @@ -19,6 +19,7 @@ else | |||
| 19 | obj-y += no-block.o | 19 | obj-y += no-block.o |
| 20 | endif | 20 | endif |
| 21 | 21 | ||
| 22 | obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o | ||
| 22 | obj-$(CONFIG_INOTIFY) += inotify.o | 23 | obj-$(CONFIG_INOTIFY) += inotify.o |
| 23 | obj-$(CONFIG_INOTIFY_USER) += inotify_user.o | 24 | obj-$(CONFIG_INOTIFY_USER) += inotify_user.o |
| 24 | obj-$(CONFIG_EPOLL) += eventpoll.o | 25 | obj-$(CONFIG_EPOLL) += eventpoll.o |
| @@ -100,6 +101,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ | |||
| 100 | obj-$(CONFIG_UFS_FS) += ufs/ | 101 | obj-$(CONFIG_UFS_FS) += ufs/ |
| 101 | obj-$(CONFIG_EFS_FS) += efs/ | 102 | obj-$(CONFIG_EFS_FS) += efs/ |
| 102 | obj-$(CONFIG_JFFS2_FS) += jffs2/ | 103 | obj-$(CONFIG_JFFS2_FS) += jffs2/ |
| 104 | obj-$(CONFIG_UBIFS_FS) += ubifs/ | ||
| 103 | obj-$(CONFIG_AFFS_FS) += affs/ | 105 | obj-$(CONFIG_AFFS_FS) += affs/ |
| 104 | obj-$(CONFIG_ROMFS_FS) += romfs/ | 106 | obj-$(CONFIG_ROMFS_FS) += romfs/ |
| 105 | obj-$(CONFIG_QNX4FS_FS) += qnx4/ | 107 | obj-$(CONFIG_QNX4FS_FS) += qnx4/ |
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0fa95b198e6e..d48ff5f370f4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
| @@ -16,7 +16,6 @@ | |||
| 16 | #include <linux/time.h> | 16 | #include <linux/time.h> |
| 17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
| 18 | #include <linux/mman.h> | 18 | #include <linux/mman.h> |
| 19 | #include <linux/a.out.h> | ||
| 20 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
| 21 | #include <linux/signal.h> | 20 | #include <linux/signal.h> |
| 22 | #include <linux/binfmts.h> | 21 | #include <linux/binfmts.h> |
| @@ -548,7 +547,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) | |||
| 548 | struct { | 547 | struct { |
| 549 | struct elfhdr elf_ex; | 548 | struct elfhdr elf_ex; |
| 550 | struct elfhdr interp_elf_ex; | 549 | struct elfhdr interp_elf_ex; |
| 551 | struct exec interp_ex; | ||
| 552 | } *loc; | 550 | } *loc; |
| 553 | 551 | ||
| 554 | loc = kmalloc(sizeof(*loc), GFP_KERNEL); | 552 | loc = kmalloc(sizeof(*loc), GFP_KERNEL); |
| @@ -680,7 +678,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) | |||
| 680 | } | 678 | } |
| 681 | 679 | ||
| 682 | /* Get the exec headers */ | 680 | /* Get the exec headers */ |
| 683 | loc->interp_ex = *((struct exec *)bprm->buf); | ||
| 684 | loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); | 681 | loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); |
| 685 | break; | 682 | break; |
| 686 | } | 683 | } |
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 000000000000..63e2ee63058d --- /dev/null +++ b/fs/bio-integrity.c | |||
| @@ -0,0 +1,719 @@ | |||
| 1 | /* | ||
| 2 | * bio-integrity.c - bio data integrity extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2007, 2008 Oracle Corporation | ||
| 5 | * Written by: Martin K. Petersen <martin.petersen@oracle.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License version | ||
| 9 | * 2 as published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, but | ||
| 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 14 | * General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; see the file COPYING. If not, write to | ||
| 18 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, | ||
| 19 | * USA. | ||
| 20 | * | ||
| 21 | */ | ||
| 22 | |||
| 23 | #include <linux/blkdev.h> | ||
| 24 | #include <linux/mempool.h> | ||
| 25 | #include <linux/bio.h> | ||
| 26 | #include <linux/workqueue.h> | ||
| 27 | |||
| 28 | static struct kmem_cache *bio_integrity_slab __read_mostly; | ||
| 29 | static struct workqueue_struct *kintegrityd_wq; | ||
| 30 | |||
| 31 | /** | ||
| 32 | * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio | ||
| 33 | * @bio: bio to attach integrity metadata to | ||
| 34 | * @gfp_mask: Memory allocation mask | ||
| 35 | * @nr_vecs: Number of integrity metadata scatter-gather elements | ||
| 36 | * @bs: bio_set to allocate from | ||
| 37 | * | ||
| 38 | * Description: This function prepares a bio for attaching integrity | ||
| 39 | * metadata. nr_vecs specifies the maximum number of pages containing | ||
| 40 | * integrity metadata that can be attached. | ||
| 41 | */ | ||
| 42 | struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, | ||
| 43 | gfp_t gfp_mask, | ||
| 44 | unsigned int nr_vecs, | ||
| 45 | struct bio_set *bs) | ||
| 46 | { | ||
| 47 | struct bio_integrity_payload *bip; | ||
| 48 | struct bio_vec *iv; | ||
| 49 | unsigned long idx; | ||
| 50 | |||
| 51 | BUG_ON(bio == NULL); | ||
| 52 | |||
| 53 | bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); | ||
| 54 | if (unlikely(bip == NULL)) { | ||
| 55 | printk(KERN_ERR "%s: could not alloc bip\n", __func__); | ||
| 56 | return NULL; | ||
| 57 | } | ||
| 58 | |||
| 59 | memset(bip, 0, sizeof(*bip)); | ||
| 60 | |||
| 61 | iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); | ||
| 62 | if (unlikely(iv == NULL)) { | ||
| 63 | printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); | ||
| 64 | mempool_free(bip, bs->bio_integrity_pool); | ||
| 65 | return NULL; | ||
| 66 | } | ||
| 67 | |||
| 68 | bip->bip_pool = idx; | ||
| 69 | bip->bip_vec = iv; | ||
| 70 | bip->bip_bio = bio; | ||
| 71 | bio->bi_integrity = bip; | ||
| 72 | |||
| 73 | return bip; | ||
| 74 | } | ||
| 75 | EXPORT_SYMBOL(bio_integrity_alloc_bioset); | ||
| 76 | |||
| 77 | /** | ||
| 78 | * bio_integrity_alloc - Allocate integrity payload and attach it to bio | ||
| 79 | * @bio: bio to attach integrity metadata to | ||
| 80 | * @gfp_mask: Memory allocation mask | ||
| 81 | * @nr_vecs: Number of integrity metadata scatter-gather elements | ||
| 82 | * | ||
| 83 | * Description: This function prepares a bio for attaching integrity | ||
| 84 | * metadata. nr_vecs specifies the maximum number of pages containing | ||
| 85 | * integrity metadata that can be attached. | ||
| 86 | */ | ||
| 87 | struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, | ||
| 88 | gfp_t gfp_mask, | ||
| 89 | unsigned int nr_vecs) | ||
| 90 | { | ||
| 91 | return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); | ||
| 92 | } | ||
| 93 | EXPORT_SYMBOL(bio_integrity_alloc); | ||
| 94 | |||
| 95 | /** | ||
| 96 | * bio_integrity_free - Free bio integrity payload | ||
| 97 | * @bio: bio containing bip to be freed | ||
| 98 | * @bs: bio_set this bio was allocated from | ||
| 99 | * | ||
| 100 | * Description: Used to free the integrity portion of a bio. Usually | ||
| 101 | * called from bio_free(). | ||
| 102 | */ | ||
| 103 | void bio_integrity_free(struct bio *bio, struct bio_set *bs) | ||
| 104 | { | ||
| 105 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 106 | |||
| 107 | BUG_ON(bip == NULL); | ||
| 108 | |||
| 109 | /* A cloned bio doesn't own the integrity metadata */ | ||
| 110 | if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) | ||
| 111 | kfree(bip->bip_buf); | ||
| 112 | |||
| 113 | mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); | ||
| 114 | mempool_free(bip, bs->bio_integrity_pool); | ||
| 115 | |||
| 116 | bio->bi_integrity = NULL; | ||
| 117 | } | ||
| 118 | EXPORT_SYMBOL(bio_integrity_free); | ||
| 119 | |||
| 120 | /** | ||
| 121 | * bio_integrity_add_page - Attach integrity metadata | ||
| 122 | * @bio: bio to update | ||
| 123 | * @page: page containing integrity metadata | ||
| 124 | * @len: number of bytes of integrity metadata in page | ||
| 125 | * @offset: start offset within page | ||
| 126 | * | ||
| 127 | * Description: Attach a page containing integrity metadata to bio. | ||
| 128 | */ | ||
| 129 | int bio_integrity_add_page(struct bio *bio, struct page *page, | ||
| 130 | unsigned int len, unsigned int offset) | ||
| 131 | { | ||
| 132 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 133 | struct bio_vec *iv; | ||
| 134 | |||
| 135 | if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { | ||
| 136 | printk(KERN_ERR "%s: bip_vec full\n", __func__); | ||
| 137 | return 0; | ||
| 138 | } | ||
| 139 | |||
| 140 | iv = bip_vec_idx(bip, bip->bip_vcnt); | ||
| 141 | BUG_ON(iv == NULL); | ||
| 142 | BUG_ON(iv->bv_page != NULL); | ||
| 143 | |||
| 144 | iv->bv_page = page; | ||
| 145 | iv->bv_len = len; | ||
| 146 | iv->bv_offset = offset; | ||
| 147 | bip->bip_vcnt++; | ||
| 148 | |||
| 149 | return len; | ||
| 150 | } | ||
| 151 | EXPORT_SYMBOL(bio_integrity_add_page); | ||
| 152 | |||
| 153 | /** | ||
| 154 | * bio_integrity_enabled - Check whether integrity can be passed | ||
| 155 | * @bio: bio to check | ||
| 156 | * | ||
| 157 | * Description: Determines whether bio_integrity_prep() can be called | ||
| 158 | * on this bio or not. bio data direction and target device must be | ||
| 159 | * set prior to calling. The functions honors the write_generate and | ||
| 160 | * read_verify flags in sysfs. | ||
| 161 | */ | ||
| 162 | int bio_integrity_enabled(struct bio *bio) | ||
| 163 | { | ||
| 164 | /* Already protected? */ | ||
| 165 | if (bio_integrity(bio)) | ||
| 166 | return 0; | ||
| 167 | |||
| 168 | return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL(bio_integrity_enabled); | ||
| 171 | |||
| 172 | /** | ||
| 173 | * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto | ||
| 174 | * @bi: blk_integrity profile for device | ||
| 175 | * @sectors: Number of 512 sectors to convert | ||
| 176 | * | ||
| 177 | * Description: The block layer calculates everything in 512 byte | ||
| 178 | * sectors but integrity metadata is done in terms of the hardware | ||
| 179 | * sector size of the storage device. Convert the block layer sectors | ||
| 180 | * to physical sectors. | ||
| 181 | */ | ||
| 182 | static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, | ||
| 183 | unsigned int sectors) | ||
| 184 | { | ||
| 185 | /* At this point there are only 512b or 4096b DIF/EPP devices */ | ||
| 186 | if (bi->sector_size == 4096) | ||
| 187 | return sectors >>= 3; | ||
| 188 | |||
| 189 | return sectors; | ||
| 190 | } | ||
| 191 | |||
| 192 | /** | ||
| 193 | * bio_integrity_tag_size - Retrieve integrity tag space | ||
| 194 | * @bio: bio to inspect | ||
| 195 | * | ||
| 196 | * Description: Returns the maximum number of tag bytes that can be | ||
| 197 | * attached to this bio. Filesystems can use this to determine how | ||
| 198 | * much metadata to attach to an I/O. | ||
| 199 | */ | ||
| 200 | unsigned int bio_integrity_tag_size(struct bio *bio) | ||
| 201 | { | ||
| 202 | struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); | ||
| 203 | |||
| 204 | BUG_ON(bio->bi_size == 0); | ||
| 205 | |||
| 206 | return bi->tag_size * (bio->bi_size / bi->sector_size); | ||
| 207 | } | ||
| 208 | EXPORT_SYMBOL(bio_integrity_tag_size); | ||
| 209 | |||
| 210 | int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) | ||
| 211 | { | ||
| 212 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 213 | struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); | ||
| 214 | unsigned int nr_sectors; | ||
| 215 | |||
| 216 | BUG_ON(bip->bip_buf == NULL); | ||
| 217 | |||
| 218 | if (bi->tag_size == 0) | ||
| 219 | return -1; | ||
| 220 | |||
| 221 | nr_sectors = bio_integrity_hw_sectors(bi, | ||
| 222 | DIV_ROUND_UP(len, bi->tag_size)); | ||
| 223 | |||
| 224 | if (nr_sectors * bi->tuple_size > bip->bip_size) { | ||
| 225 | printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", | ||
| 226 | __func__, nr_sectors * bi->tuple_size, bip->bip_size); | ||
| 227 | return -1; | ||
| 228 | } | ||
| 229 | |||
| 230 | if (set) | ||
| 231 | bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); | ||
| 232 | else | ||
| 233 | bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); | ||
| 234 | |||
| 235 | return 0; | ||
| 236 | } | ||
| 237 | |||
| 238 | /** | ||
| 239 | * bio_integrity_set_tag - Attach a tag buffer to a bio | ||
| 240 | * @bio: bio to attach buffer to | ||
| 241 | * @tag_buf: Pointer to a buffer containing tag data | ||
| 242 | * @len: Length of the included buffer | ||
| 243 | * | ||
| 244 | * Description: Use this function to tag a bio by leveraging the extra | ||
| 245 | * space provided by devices formatted with integrity protection. The | ||
| 246 | * size of the integrity buffer must be <= to the size reported by | ||
| 247 | * bio_integrity_tag_size(). | ||
| 248 | */ | ||
| 249 | int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) | ||
| 250 | { | ||
| 251 | BUG_ON(bio_data_dir(bio) != WRITE); | ||
| 252 | |||
| 253 | return bio_integrity_tag(bio, tag_buf, len, 1); | ||
| 254 | } | ||
| 255 | EXPORT_SYMBOL(bio_integrity_set_tag); | ||
| 256 | |||
| 257 | /** | ||
| 258 | * bio_integrity_get_tag - Retrieve a tag buffer from a bio | ||
| 259 | * @bio: bio to retrieve buffer from | ||
| 260 | * @tag_buf: Pointer to a buffer for the tag data | ||
| 261 | * @len: Length of the target buffer | ||
| 262 | * | ||
| 263 | * Description: Use this function to retrieve the tag buffer from a | ||
| 264 | * completed I/O. The size of the integrity buffer must be <= to the | ||
| 265 | * size reported by bio_integrity_tag_size(). | ||
| 266 | */ | ||
| 267 | int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) | ||
| 268 | { | ||
| 269 | BUG_ON(bio_data_dir(bio) != READ); | ||
| 270 | |||
| 271 | return bio_integrity_tag(bio, tag_buf, len, 0); | ||
| 272 | } | ||
| 273 | EXPORT_SYMBOL(bio_integrity_get_tag); | ||
| 274 | |||
| 275 | /** | ||
| 276 | * bio_integrity_generate - Generate integrity metadata for a bio | ||
| 277 | * @bio: bio to generate integrity metadata for | ||
| 278 | * | ||
| 279 | * Description: Generates integrity metadata for a bio by calling the | ||
| 280 | * block device's generation callback function. The bio must have a | ||
| 281 | * bip attached with enough room to accommodate the generated | ||
| 282 | * integrity metadata. | ||
| 283 | */ | ||
| 284 | static void bio_integrity_generate(struct bio *bio) | ||
| 285 | { | ||
| 286 | struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); | ||
| 287 | struct blk_integrity_exchg bix; | ||
| 288 | struct bio_vec *bv; | ||
| 289 | sector_t sector = bio->bi_sector; | ||
| 290 | unsigned int i, sectors, total; | ||
| 291 | void *prot_buf = bio->bi_integrity->bip_buf; | ||
| 292 | |||
| 293 | total = 0; | ||
| 294 | bix.disk_name = bio->bi_bdev->bd_disk->disk_name; | ||
| 295 | bix.sector_size = bi->sector_size; | ||
| 296 | |||
| 297 | bio_for_each_segment(bv, bio, i) { | ||
| 298 | void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); | ||
| 299 | bix.data_buf = kaddr + bv->bv_offset; | ||
| 300 | bix.data_size = bv->bv_len; | ||
| 301 | bix.prot_buf = prot_buf; | ||
| 302 | bix.sector = sector; | ||
| 303 | |||
| 304 | bi->generate_fn(&bix); | ||
| 305 | |||
| 306 | sectors = bv->bv_len / bi->sector_size; | ||
| 307 | sector += sectors; | ||
| 308 | prot_buf += sectors * bi->tuple_size; | ||
| 309 | total += sectors * bi->tuple_size; | ||
| 310 | BUG_ON(total > bio->bi_integrity->bip_size); | ||
| 311 | |||
| 312 | kunmap_atomic(kaddr, KM_USER0); | ||
| 313 | } | ||
| 314 | } | ||
| 315 | |||
| 316 | /** | ||
| 317 | * bio_integrity_prep - Prepare bio for integrity I/O | ||
| 318 | * @bio: bio to prepare | ||
| 319 | * | ||
| 320 | * Description: Allocates a buffer for integrity metadata, maps the | ||
| 321 | * pages and attaches them to a bio. The bio must have data | ||
| 322 | * direction, target device and start sector set priot to calling. In | ||
| 323 | * the WRITE case, integrity metadata will be generated using the | ||
| 324 | * block device's integrity function. In the READ case, the buffer | ||
| 325 | * will be prepared for DMA and a suitable end_io handler set up. | ||
| 326 | */ | ||
| 327 | int bio_integrity_prep(struct bio *bio) | ||
| 328 | { | ||
| 329 | struct bio_integrity_payload *bip; | ||
| 330 | struct blk_integrity *bi; | ||
| 331 | struct request_queue *q; | ||
| 332 | void *buf; | ||
| 333 | unsigned long start, end; | ||
| 334 | unsigned int len, nr_pages; | ||
| 335 | unsigned int bytes, offset, i; | ||
| 336 | unsigned int sectors; | ||
| 337 | |||
| 338 | bi = bdev_get_integrity(bio->bi_bdev); | ||
| 339 | q = bdev_get_queue(bio->bi_bdev); | ||
| 340 | BUG_ON(bi == NULL); | ||
| 341 | BUG_ON(bio_integrity(bio)); | ||
| 342 | |||
| 343 | sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); | ||
| 344 | |||
| 345 | /* Allocate kernel buffer for protection data */ | ||
| 346 | len = sectors * blk_integrity_tuple_size(bi); | ||
| 347 | buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); | ||
| 348 | if (unlikely(buf == NULL)) { | ||
| 349 | printk(KERN_ERR "could not allocate integrity buffer\n"); | ||
| 350 | return -EIO; | ||
| 351 | } | ||
| 352 | |||
| 353 | end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 354 | start = ((unsigned long) buf) >> PAGE_SHIFT; | ||
| 355 | nr_pages = end - start; | ||
| 356 | |||
| 357 | /* Allocate bio integrity payload and integrity vectors */ | ||
| 358 | bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); | ||
| 359 | if (unlikely(bip == NULL)) { | ||
| 360 | printk(KERN_ERR "could not allocate data integrity bioset\n"); | ||
| 361 | kfree(buf); | ||
| 362 | return -EIO; | ||
| 363 | } | ||
| 364 | |||
| 365 | bip->bip_buf = buf; | ||
| 366 | bip->bip_size = len; | ||
| 367 | bip->bip_sector = bio->bi_sector; | ||
| 368 | |||
| 369 | /* Map it */ | ||
| 370 | offset = offset_in_page(buf); | ||
| 371 | for (i = 0 ; i < nr_pages ; i++) { | ||
| 372 | int ret; | ||
| 373 | bytes = PAGE_SIZE - offset; | ||
| 374 | |||
| 375 | if (len <= 0) | ||
| 376 | break; | ||
| 377 | |||
| 378 | if (bytes > len) | ||
| 379 | bytes = len; | ||
| 380 | |||
| 381 | ret = bio_integrity_add_page(bio, virt_to_page(buf), | ||
| 382 | bytes, offset); | ||
| 383 | |||
| 384 | if (ret == 0) | ||
| 385 | return 0; | ||
| 386 | |||
| 387 | if (ret < bytes) | ||
| 388 | break; | ||
| 389 | |||
| 390 | buf += bytes; | ||
| 391 | len -= bytes; | ||
| 392 | offset = 0; | ||
| 393 | } | ||
| 394 | |||
| 395 | /* Install custom I/O completion handler if read verify is enabled */ | ||
| 396 | if (bio_data_dir(bio) == READ) { | ||
| 397 | bip->bip_end_io = bio->bi_end_io; | ||
| 398 | bio->bi_end_io = bio_integrity_endio; | ||
| 399 | } | ||
| 400 | |||
| 401 | /* Auto-generate integrity metadata if this is a write */ | ||
| 402 | if (bio_data_dir(bio) == WRITE) | ||
| 403 | bio_integrity_generate(bio); | ||
| 404 | |||
| 405 | return 0; | ||
| 406 | } | ||
| 407 | EXPORT_SYMBOL(bio_integrity_prep); | ||
| 408 | |||
| 409 | /** | ||
| 410 | * bio_integrity_verify - Verify integrity metadata for a bio | ||
| 411 | * @bio: bio to verify | ||
| 412 | * | ||
| 413 | * Description: This function is called to verify the integrity of a | ||
| 414 | * bio. The data in the bio io_vec is compared to the integrity | ||
| 415 | * metadata returned by the HBA. | ||
| 416 | */ | ||
| 417 | static int bio_integrity_verify(struct bio *bio) | ||
| 418 | { | ||
| 419 | struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); | ||
| 420 | struct blk_integrity_exchg bix; | ||
| 421 | struct bio_vec *bv; | ||
| 422 | sector_t sector = bio->bi_integrity->bip_sector; | ||
| 423 | unsigned int i, sectors, total, ret; | ||
| 424 | void *prot_buf = bio->bi_integrity->bip_buf; | ||
| 425 | |||
| 426 | ret = total = 0; | ||
| 427 | bix.disk_name = bio->bi_bdev->bd_disk->disk_name; | ||
| 428 | bix.sector_size = bi->sector_size; | ||
| 429 | |||
| 430 | bio_for_each_segment(bv, bio, i) { | ||
| 431 | void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); | ||
| 432 | bix.data_buf = kaddr + bv->bv_offset; | ||
| 433 | bix.data_size = bv->bv_len; | ||
| 434 | bix.prot_buf = prot_buf; | ||
| 435 | bix.sector = sector; | ||
| 436 | |||
| 437 | ret = bi->verify_fn(&bix); | ||
| 438 | |||
| 439 | if (ret) { | ||
| 440 | kunmap_atomic(kaddr, KM_USER0); | ||
| 441 | break; | ||
| 442 | } | ||
| 443 | |||
| 444 | sectors = bv->bv_len / bi->sector_size; | ||
| 445 | sector += sectors; | ||
| 446 | prot_buf += sectors * bi->tuple_size; | ||
| 447 | total += sectors * bi->tuple_size; | ||
| 448 | BUG_ON(total > bio->bi_integrity->bip_size); | ||
| 449 | |||
| 450 | kunmap_atomic(kaddr, KM_USER0); | ||
| 451 | } | ||
| 452 | |||
| 453 | return ret; | ||
| 454 | } | ||
| 455 | |||
| 456 | /** | ||
| 457 | * bio_integrity_verify_fn - Integrity I/O completion worker | ||
| 458 | * @work: Work struct stored in bio to be verified | ||
| 459 | * | ||
| 460 | * Description: This workqueue function is called to complete a READ | ||
| 461 | * request. The function verifies the transferred integrity metadata | ||
| 462 | * and then calls the original bio end_io function. | ||
| 463 | */ | ||
| 464 | static void bio_integrity_verify_fn(struct work_struct *work) | ||
| 465 | { | ||
| 466 | struct bio_integrity_payload *bip = | ||
| 467 | container_of(work, struct bio_integrity_payload, bip_work); | ||
| 468 | struct bio *bio = bip->bip_bio; | ||
| 469 | int error = bip->bip_error; | ||
| 470 | |||
| 471 | if (bio_integrity_verify(bio)) { | ||
| 472 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 473 | error = -EIO; | ||
| 474 | } | ||
| 475 | |||
| 476 | /* Restore original bio completion handler */ | ||
| 477 | bio->bi_end_io = bip->bip_end_io; | ||
| 478 | |||
| 479 | if (bio->bi_end_io) | ||
| 480 | bio->bi_end_io(bio, error); | ||
| 481 | } | ||
| 482 | |||
| 483 | /** | ||
| 484 | * bio_integrity_endio - Integrity I/O completion function | ||
| 485 | * @bio: Protected bio | ||
| 486 | * @error: Pointer to errno | ||
| 487 | * | ||
| 488 | * Description: Completion for integrity I/O | ||
| 489 | * | ||
| 490 | * Normally I/O completion is done in interrupt context. However, | ||
| 491 | * verifying I/O integrity is a time-consuming task which must be run | ||
| 492 | * in process context. This function postpones completion | ||
| 493 | * accordingly. | ||
| 494 | */ | ||
| 495 | void bio_integrity_endio(struct bio *bio, int error) | ||
| 496 | { | ||
| 497 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 498 | |||
| 499 | BUG_ON(bip->bip_bio != bio); | ||
| 500 | |||
| 501 | bip->bip_error = error; | ||
| 502 | INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); | ||
| 503 | queue_work(kintegrityd_wq, &bip->bip_work); | ||
| 504 | } | ||
| 505 | EXPORT_SYMBOL(bio_integrity_endio); | ||
| 506 | |||
| 507 | /** | ||
| 508 | * bio_integrity_mark_head - Advance bip_vec skip bytes | ||
| 509 | * @bip: Integrity vector to advance | ||
| 510 | * @skip: Number of bytes to advance it | ||
| 511 | */ | ||
| 512 | void bio_integrity_mark_head(struct bio_integrity_payload *bip, | ||
| 513 | unsigned int skip) | ||
| 514 | { | ||
| 515 | struct bio_vec *iv; | ||
| 516 | unsigned int i; | ||
| 517 | |||
| 518 | bip_for_each_vec(iv, bip, i) { | ||
| 519 | if (skip == 0) { | ||
| 520 | bip->bip_idx = i; | ||
| 521 | return; | ||
| 522 | } else if (skip >= iv->bv_len) { | ||
| 523 | skip -= iv->bv_len; | ||
| 524 | } else { /* skip < iv->bv_len) */ | ||
| 525 | iv->bv_offset += skip; | ||
| 526 | iv->bv_len -= skip; | ||
| 527 | bip->bip_idx = i; | ||
| 528 | return; | ||
| 529 | } | ||
| 530 | } | ||
| 531 | } | ||
| 532 | |||
| 533 | /** | ||
| 534 | * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long | ||
| 535 | * @bip: Integrity vector to truncate | ||
| 536 | * @len: New length of integrity vector | ||
| 537 | */ | ||
| 538 | void bio_integrity_mark_tail(struct bio_integrity_payload *bip, | ||
| 539 | unsigned int len) | ||
| 540 | { | ||
| 541 | struct bio_vec *iv; | ||
| 542 | unsigned int i; | ||
| 543 | |||
| 544 | bip_for_each_vec(iv, bip, i) { | ||
| 545 | if (len == 0) { | ||
| 546 | bip->bip_vcnt = i; | ||
| 547 | return; | ||
| 548 | } else if (len >= iv->bv_len) { | ||
| 549 | len -= iv->bv_len; | ||
| 550 | } else { /* len < iv->bv_len) */ | ||
| 551 | iv->bv_len = len; | ||
| 552 | len = 0; | ||
| 553 | } | ||
| 554 | } | ||
| 555 | } | ||
| 556 | |||
| 557 | /** | ||
| 558 | * bio_integrity_advance - Advance integrity vector | ||
| 559 | * @bio: bio whose integrity vector to update | ||
| 560 | * @bytes_done: number of data bytes that have been completed | ||
| 561 | * | ||
| 562 | * Description: This function calculates how many integrity bytes the | ||
| 563 | * number of completed data bytes correspond to and advances the | ||
| 564 | * integrity vector accordingly. | ||
| 565 | */ | ||
| 566 | void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) | ||
| 567 | { | ||
| 568 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 569 | struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); | ||
| 570 | unsigned int nr_sectors; | ||
| 571 | |||
| 572 | BUG_ON(bip == NULL); | ||
| 573 | BUG_ON(bi == NULL); | ||
| 574 | |||
| 575 | nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); | ||
| 576 | bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); | ||
| 577 | } | ||
| 578 | EXPORT_SYMBOL(bio_integrity_advance); | ||
| 579 | |||
| 580 | /** | ||
| 581 | * bio_integrity_trim - Trim integrity vector | ||
| 582 | * @bio: bio whose integrity vector to update | ||
| 583 | * @offset: offset to first data sector | ||
| 584 | * @sectors: number of data sectors | ||
| 585 | * | ||
| 586 | * Description: Used to trim the integrity vector in a cloned bio. | ||
| 587 | * The ivec will be advanced corresponding to 'offset' data sectors | ||
| 588 | * and the length will be truncated corresponding to 'len' data | ||
| 589 | * sectors. | ||
| 590 | */ | ||
| 591 | void bio_integrity_trim(struct bio *bio, unsigned int offset, | ||
| 592 | unsigned int sectors) | ||
| 593 | { | ||
| 594 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 595 | struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); | ||
| 596 | unsigned int nr_sectors; | ||
| 597 | |||
| 598 | BUG_ON(bip == NULL); | ||
| 599 | BUG_ON(bi == NULL); | ||
| 600 | BUG_ON(!bio_flagged(bio, BIO_CLONED)); | ||
| 601 | |||
| 602 | nr_sectors = bio_integrity_hw_sectors(bi, sectors); | ||
| 603 | bip->bip_sector = bip->bip_sector + offset; | ||
| 604 | bio_integrity_mark_head(bip, offset * bi->tuple_size); | ||
| 605 | bio_integrity_mark_tail(bip, sectors * bi->tuple_size); | ||
| 606 | } | ||
| 607 | EXPORT_SYMBOL(bio_integrity_trim); | ||
| 608 | |||
| 609 | /** | ||
| 610 | * bio_integrity_split - Split integrity metadata | ||
| 611 | * @bio: Protected bio | ||
| 612 | * @bp: Resulting bio_pair | ||
| 613 | * @sectors: Offset | ||
| 614 | * | ||
| 615 | * Description: Splits an integrity page into a bio_pair. | ||
| 616 | */ | ||
| 617 | void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) | ||
| 618 | { | ||
| 619 | struct blk_integrity *bi; | ||
| 620 | struct bio_integrity_payload *bip = bio->bi_integrity; | ||
| 621 | unsigned int nr_sectors; | ||
| 622 | |||
| 623 | if (bio_integrity(bio) == 0) | ||
| 624 | return; | ||
| 625 | |||
| 626 | bi = bdev_get_integrity(bio->bi_bdev); | ||
| 627 | BUG_ON(bi == NULL); | ||
| 628 | BUG_ON(bip->bip_vcnt != 1); | ||
| 629 | |||
| 630 | nr_sectors = bio_integrity_hw_sectors(bi, sectors); | ||
| 631 | |||
| 632 | bp->bio1.bi_integrity = &bp->bip1; | ||
| 633 | bp->bio2.bi_integrity = &bp->bip2; | ||
| 634 | |||
| 635 | bp->iv1 = bip->bip_vec[0]; | ||
| 636 | bp->iv2 = bip->bip_vec[0]; | ||
| 637 | |||
| 638 | bp->bip1.bip_vec = &bp->iv1; | ||
| 639 | bp->bip2.bip_vec = &bp->iv2; | ||
| 640 | |||
| 641 | bp->iv1.bv_len = sectors * bi->tuple_size; | ||
| 642 | bp->iv2.bv_offset += sectors * bi->tuple_size; | ||
| 643 | bp->iv2.bv_len -= sectors * bi->tuple_size; | ||
| 644 | |||
| 645 | bp->bip1.bip_sector = bio->bi_integrity->bip_sector; | ||
| 646 | bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; | ||
| 647 | |||
| 648 | bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; | ||
| 649 | bp->bip1.bip_idx = bp->bip2.bip_idx = 0; | ||
| 650 | } | ||
| 651 | EXPORT_SYMBOL(bio_integrity_split); | ||
| 652 | |||
| 653 | /** | ||
| 654 | * bio_integrity_clone - Callback for cloning bios with integrity metadata | ||
| 655 | * @bio: New bio | ||
| 656 | * @bio_src: Original bio | ||
| 657 | * @bs: bio_set to allocate bip from | ||
| 658 | * | ||
| 659 | * Description: Called to allocate a bip when cloning a bio | ||
| 660 | */ | ||
| 661 | int bio_integrity_clone(struct bio *bio, struct bio *bio_src, | ||
| 662 | struct bio_set *bs) | ||
| 663 | { | ||
| 664 | struct bio_integrity_payload *bip_src = bio_src->bi_integrity; | ||
| 665 | struct bio_integrity_payload *bip; | ||
| 666 | |||
| 667 | BUG_ON(bip_src == NULL); | ||
| 668 | |||
| 669 | bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); | ||
| 670 | |||
| 671 | if (bip == NULL) | ||
| 672 | return -EIO; | ||
| 673 | |||
| 674 | memcpy(bip->bip_vec, bip_src->bip_vec, | ||
| 675 | bip_src->bip_vcnt * sizeof(struct bio_vec)); | ||
| 676 | |||
| 677 | bip->bip_sector = bip_src->bip_sector; | ||
| 678 | bip->bip_vcnt = bip_src->bip_vcnt; | ||
| 679 | bip->bip_idx = bip_src->bip_idx; | ||
| 680 | |||
| 681 | return 0; | ||
| 682 | } | ||
| 683 | EXPORT_SYMBOL(bio_integrity_clone); | ||
| 684 | |||
| 685 | int bioset_integrity_create(struct bio_set *bs, int pool_size) | ||
| 686 | { | ||
| 687 | bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, | ||
| 688 | bio_integrity_slab); | ||
| 689 | if (!bs->bio_integrity_pool) | ||
| 690 | return -1; | ||
| 691 | |||
| 692 | return 0; | ||
| 693 | } | ||
| 694 | EXPORT_SYMBOL(bioset_integrity_create); | ||
| 695 | |||
| 696 | void bioset_integrity_free(struct bio_set *bs) | ||
| 697 | { | ||
| 698 | if (bs->bio_integrity_pool) | ||
| 699 | mempool_destroy(bs->bio_integrity_pool); | ||
| 700 | } | ||
| 701 | EXPORT_SYMBOL(bioset_integrity_free); | ||
| 702 | |||
| 703 | void __init bio_integrity_init_slab(void) | ||
| 704 | { | ||
| 705 | bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, | ||
| 706 | SLAB_HWCACHE_ALIGN|SLAB_PANIC); | ||
| 707 | } | ||
| 708 | EXPORT_SYMBOL(bio_integrity_init_slab); | ||
| 709 | |||
| 710 | static int __init integrity_init(void) | ||
| 711 | { | ||
| 712 | kintegrityd_wq = create_workqueue("kintegrityd"); | ||
| 713 | |||
| 714 | if (!kintegrityd_wq) | ||
| 715 | panic("Failed to create kintegrityd\n"); | ||
| 716 | |||
| 717 | return 0; | ||
| 718 | } | ||
| 719 | subsys_initcall(integrity_init); | ||
| @@ -28,25 +28,10 @@ | |||
| 28 | #include <linux/blktrace_api.h> | 28 | #include <linux/blktrace_api.h> |
| 29 | #include <scsi/sg.h> /* for struct sg_iovec */ | 29 | #include <scsi/sg.h> /* for struct sg_iovec */ |
| 30 | 30 | ||
| 31 | #define BIO_POOL_SIZE 2 | ||
| 32 | |||
| 33 | static struct kmem_cache *bio_slab __read_mostly; | 31 | static struct kmem_cache *bio_slab __read_mostly; |
| 34 | 32 | ||
| 35 | #define BIOVEC_NR_POOLS 6 | ||
| 36 | |||
| 37 | /* | ||
| 38 | * a small number of entries is fine, not going to be performance critical. | ||
| 39 | * basically we just need to survive | ||
| 40 | */ | ||
| 41 | #define BIO_SPLIT_ENTRIES 2 | ||
| 42 | mempool_t *bio_split_pool __read_mostly; | 33 | mempool_t *bio_split_pool __read_mostly; |
| 43 | 34 | ||
| 44 | struct biovec_slab { | ||
| 45 | int nr_vecs; | ||
| 46 | char *name; | ||
| 47 | struct kmem_cache *slab; | ||
| 48 | }; | ||
| 49 | |||
| 50 | /* | 35 | /* |
| 51 | * if you change this list, also change bvec_alloc or things will | 36 | * if you change this list, also change bvec_alloc or things will |
| 52 | * break badly! cannot be bigger than what you can fit into an | 37 | * break badly! cannot be bigger than what you can fit into an |
| @@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { | |||
| 60 | #undef BV | 45 | #undef BV |
| 61 | 46 | ||
| 62 | /* | 47 | /* |
| 63 | * bio_set is used to allow other portions of the IO system to | ||
| 64 | * allocate their own private memory pools for bio and iovec structures. | ||
| 65 | * These memory pools in turn all allocate from the bio_slab | ||
| 66 | * and the bvec_slabs[]. | ||
| 67 | */ | ||
| 68 | struct bio_set { | ||
| 69 | mempool_t *bio_pool; | ||
| 70 | mempool_t *bvec_pools[BIOVEC_NR_POOLS]; | ||
| 71 | }; | ||
| 72 | |||
| 73 | /* | ||
| 74 | * fs_bio_set is the bio_set containing bio and iovec memory pools used by | 48 | * fs_bio_set is the bio_set containing bio and iovec memory pools used by |
| 75 | * IO code that does not need private memory pools. | 49 | * IO code that does not need private memory pools. |
| 76 | */ | 50 | */ |
| 77 | static struct bio_set *fs_bio_set; | 51 | struct bio_set *fs_bio_set; |
| 52 | |||
| 53 | unsigned int bvec_nr_vecs(unsigned short idx) | ||
| 54 | { | ||
| 55 | return bvec_slabs[idx].nr_vecs; | ||
| 56 | } | ||
| 78 | 57 | ||
| 79 | static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) | 58 | struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) |
| 80 | { | 59 | { |
| 81 | struct bio_vec *bvl; | 60 | struct bio_vec *bvl; |
| 82 | 61 | ||
| @@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) | |||
| 117 | mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); | 96 | mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); |
| 118 | } | 97 | } |
| 119 | 98 | ||
| 99 | if (bio_integrity(bio)) | ||
| 100 | bio_integrity_free(bio, bio_set); | ||
| 101 | |||
| 120 | mempool_free(bio, bio_set->bio_pool); | 102 | mempool_free(bio, bio_set->bio_pool); |
| 121 | } | 103 | } |
| 122 | 104 | ||
| @@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) | |||
| 275 | { | 257 | { |
| 276 | struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); | 258 | struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); |
| 277 | 259 | ||
| 278 | if (b) { | 260 | if (!b) |
| 279 | b->bi_destructor = bio_fs_destructor; | 261 | return NULL; |
| 280 | __bio_clone(b, bio); | 262 | |
| 263 | b->bi_destructor = bio_fs_destructor; | ||
| 264 | __bio_clone(b, bio); | ||
| 265 | |||
| 266 | if (bio_integrity(bio)) { | ||
| 267 | int ret; | ||
| 268 | |||
| 269 | ret = bio_integrity_clone(b, bio, fs_bio_set); | ||
| 270 | |||
| 271 | if (ret < 0) | ||
| 272 | return NULL; | ||
| 281 | } | 273 | } |
| 282 | 274 | ||
| 283 | return b; | 275 | return b; |
| @@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page | |||
| 333 | if (page == prev->bv_page && | 325 | if (page == prev->bv_page && |
| 334 | offset == prev->bv_offset + prev->bv_len) { | 326 | offset == prev->bv_offset + prev->bv_len) { |
| 335 | prev->bv_len += len; | 327 | prev->bv_len += len; |
| 336 | if (q->merge_bvec_fn && | 328 | |
| 337 | q->merge_bvec_fn(q, bio, prev) < len) { | 329 | if (q->merge_bvec_fn) { |
| 338 | prev->bv_len -= len; | 330 | struct bvec_merge_data bvm = { |
| 339 | return 0; | 331 | .bi_bdev = bio->bi_bdev, |
| 332 | .bi_sector = bio->bi_sector, | ||
| 333 | .bi_size = bio->bi_size, | ||
| 334 | .bi_rw = bio->bi_rw, | ||
| 335 | }; | ||
| 336 | |||
| 337 | if (q->merge_bvec_fn(q, &bvm, prev) < len) { | ||
| 338 | prev->bv_len -= len; | ||
| 339 | return 0; | ||
| 340 | } | ||
| 340 | } | 341 | } |
| 341 | 342 | ||
| 342 | goto done; | 343 | goto done; |
| @@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page | |||
| 377 | * queue to get further control | 378 | * queue to get further control |
| 378 | */ | 379 | */ |
| 379 | if (q->merge_bvec_fn) { | 380 | if (q->merge_bvec_fn) { |
| 381 | struct bvec_merge_data bvm = { | ||
| 382 | .bi_bdev = bio->bi_bdev, | ||
| 383 | .bi_sector = bio->bi_sector, | ||
| 384 | .bi_size = bio->bi_size, | ||
| 385 | .bi_rw = bio->bi_rw, | ||
| 386 | }; | ||
| 387 | |||
| 380 | /* | 388 | /* |
| 381 | * merge_bvec_fn() returns number of bytes it can accept | 389 | * merge_bvec_fn() returns number of bytes it can accept |
| 382 | * at this offset | 390 | * at this offset |
| 383 | */ | 391 | */ |
| 384 | if (q->merge_bvec_fn(q, bio, bvec) < len) { | 392 | if (q->merge_bvec_fn(q, &bvm, bvec) < len) { |
| 385 | bvec->bv_page = NULL; | 393 | bvec->bv_page = NULL; |
| 386 | bvec->bv_len = 0; | 394 | bvec->bv_len = 0; |
| 387 | bvec->bv_offset = 0; | 395 | bvec->bv_offset = 0; |
| @@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) | |||
| 1249 | bp->bio1.bi_private = bi; | 1257 | bp->bio1.bi_private = bi; |
| 1250 | bp->bio2.bi_private = pool; | 1258 | bp->bio2.bi_private = pool; |
| 1251 | 1259 | ||
| 1260 | if (bio_integrity(bi)) | ||
| 1261 | bio_integrity_split(bi, bp, first_sectors); | ||
| 1262 | |||
| 1252 | return bp; | 1263 | return bp; |
| 1253 | } | 1264 | } |
| 1254 | 1265 | ||
| @@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs) | |||
| 1290 | if (bs->bio_pool) | 1301 | if (bs->bio_pool) |
| 1291 | mempool_destroy(bs->bio_pool); | 1302 | mempool_destroy(bs->bio_pool); |
| 1292 | 1303 | ||
| 1304 | bioset_integrity_free(bs); | ||
| 1293 | biovec_free_pools(bs); | 1305 | biovec_free_pools(bs); |
| 1294 | 1306 | ||
| 1295 | kfree(bs); | 1307 | kfree(bs); |
| @@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) | |||
| 1306 | if (!bs->bio_pool) | 1318 | if (!bs->bio_pool) |
| 1307 | goto bad; | 1319 | goto bad; |
| 1308 | 1320 | ||
| 1321 | if (bioset_integrity_create(bs, bio_pool_size)) | ||
| 1322 | goto bad; | ||
| 1323 | |||
| 1309 | if (!biovec_create_pools(bs, bvec_pool_size)) | 1324 | if (!biovec_create_pools(bs, bvec_pool_size)) |
| 1310 | return bs; | 1325 | return bs; |
| 1311 | 1326 | ||
| @@ -1332,6 +1347,7 @@ static int __init init_bio(void) | |||
| 1332 | { | 1347 | { |
| 1333 | bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 1348 | bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
| 1334 | 1349 | ||
| 1350 | bio_integrity_init_slab(); | ||
| 1335 | biovec_init_slabs(); | 1351 | biovec_init_slabs(); |
| 1336 | 1352 | ||
| 1337 | fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); | 1353 | fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 470c10ceb0fb..10d8a0aa871a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -931,8 +931,16 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) | |||
| 931 | struct gendisk *disk; | 931 | struct gendisk *disk; |
| 932 | int ret; | 932 | int ret; |
| 933 | int part; | 933 | int part; |
| 934 | int perm = 0; | ||
| 934 | 935 | ||
| 935 | ret = devcgroup_inode_permission(bdev->bd_inode, file->f_mode); | 936 | if (file->f_mode & FMODE_READ) |
| 937 | perm |= MAY_READ; | ||
| 938 | if (file->f_mode & FMODE_WRITE) | ||
| 939 | perm |= MAY_WRITE; | ||
| 940 | /* | ||
| 941 | * hooks: /n/, see "layering violations". | ||
| 942 | */ | ||
| 943 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); | ||
| 936 | if (ret != 0) | 944 | if (ret != 0) |
| 937 | return ret; | 945 | return ret; |
| 938 | 946 | ||
diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f013..d48caee12e2a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) | |||
| 821 | * contents - it is a noop if I/O is still in | 821 | * contents - it is a noop if I/O is still in |
| 822 | * flight on potentially older contents. | 822 | * flight on potentially older contents. |
| 823 | */ | 823 | */ |
| 824 | ll_rw_block(SWRITE, 1, &bh); | 824 | ll_rw_block(SWRITE_SYNC, 1, &bh); |
| 825 | brelse(bh); | 825 | brelse(bh); |
| 826 | spin_lock(lock); | 826 | spin_lock(lock); |
| 827 | } | 827 | } |
| @@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg) | |||
| 1464 | 1464 | ||
| 1465 | void invalidate_bh_lrus(void) | 1465 | void invalidate_bh_lrus(void) |
| 1466 | { | 1466 | { |
| 1467 | on_each_cpu(invalidate_bh_lru, NULL, 1, 1); | 1467 | on_each_cpu(invalidate_bh_lru, NULL, 1); |
| 1468 | } | 1468 | } |
| 1469 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); | 1469 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); |
| 1470 | 1470 | ||
| @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, | |||
| 1691 | */ | 1691 | */ |
| 1692 | clear_buffer_dirty(bh); | 1692 | clear_buffer_dirty(bh); |
| 1693 | set_buffer_uptodate(bh); | 1693 | set_buffer_uptodate(bh); |
| 1694 | } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { | 1694 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && |
| 1695 | buffer_dirty(bh)) { | ||
| 1695 | WARN_ON(bh->b_size != blocksize); | 1696 | WARN_ON(bh->b_size != blocksize); |
| 1696 | err = get_block(inode, block, bh, 1); | 1697 | err = get_block(inode, block, bh, 1); |
| 1697 | if (err) | 1698 | if (err) |
| 1698 | goto recover; | 1699 | goto recover; |
| 1700 | clear_buffer_delay(bh); | ||
| 1699 | if (buffer_new(bh)) { | 1701 | if (buffer_new(bh)) { |
| 1700 | /* blockdev mappings never come here */ | 1702 | /* blockdev mappings never come here */ |
| 1701 | clear_buffer_new(bh); | 1703 | clear_buffer_new(bh); |
| @@ -1774,7 +1776,8 @@ recover: | |||
| 1774 | bh = head; | 1776 | bh = head; |
| 1775 | /* Recovery: lock and submit the mapped buffers */ | 1777 | /* Recovery: lock and submit the mapped buffers */ |
| 1776 | do { | 1778 | do { |
| 1777 | if (buffer_mapped(bh) && buffer_dirty(bh)) { | 1779 | if (buffer_mapped(bh) && buffer_dirty(bh) && |
| 1780 | !buffer_delay(bh)) { | ||
| 1778 | lock_buffer(bh); | 1781 | lock_buffer(bh); |
| 1779 | mark_buffer_async_write(bh); | 1782 | mark_buffer_async_write(bh); |
| 1780 | } else { | 1783 | } else { |
| @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
| 2061 | struct page *page, void *fsdata) | 2064 | struct page *page, void *fsdata) |
| 2062 | { | 2065 | { |
| 2063 | struct inode *inode = mapping->host; | 2066 | struct inode *inode = mapping->host; |
| 2067 | int i_size_changed = 0; | ||
| 2064 | 2068 | ||
| 2065 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 2069 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
| 2066 | 2070 | ||
| @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
| 2073 | */ | 2077 | */ |
| 2074 | if (pos+copied > inode->i_size) { | 2078 | if (pos+copied > inode->i_size) { |
| 2075 | i_size_write(inode, pos+copied); | 2079 | i_size_write(inode, pos+copied); |
| 2076 | mark_inode_dirty(inode); | 2080 | i_size_changed = 1; |
| 2077 | } | 2081 | } |
| 2078 | 2082 | ||
| 2079 | unlock_page(page); | 2083 | unlock_page(page); |
| 2080 | page_cache_release(page); | 2084 | page_cache_release(page); |
| 2081 | 2085 | ||
| 2086 | /* | ||
| 2087 | * Don't mark the inode dirty under page lock. First, it unnecessarily | ||
| 2088 | * makes the holding time of page lock longer. Second, it forces lock | ||
| 2089 | * ordering of page lock and transaction start for journaling | ||
| 2090 | * filesystems. | ||
| 2091 | */ | ||
| 2092 | if (i_size_changed) | ||
| 2093 | mark_inode_dirty(inode); | ||
| 2094 | |||
| 2082 | return copied; | 2095 | return copied; |
| 2083 | } | 2096 | } |
| 2084 | EXPORT_SYMBOL(generic_write_end); | 2097 | EXPORT_SYMBOL(generic_write_end); |
| @@ -2940,16 +2953,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) | |||
| 2940 | for (i = 0; i < nr; i++) { | 2953 | for (i = 0; i < nr; i++) { |
| 2941 | struct buffer_head *bh = bhs[i]; | 2954 | struct buffer_head *bh = bhs[i]; |
| 2942 | 2955 | ||
| 2943 | if (rw == SWRITE) | 2956 | if (rw == SWRITE || rw == SWRITE_SYNC) |
| 2944 | lock_buffer(bh); | 2957 | lock_buffer(bh); |
| 2945 | else if (test_set_buffer_locked(bh)) | 2958 | else if (test_set_buffer_locked(bh)) |
| 2946 | continue; | 2959 | continue; |
| 2947 | 2960 | ||
| 2948 | if (rw == WRITE || rw == SWRITE) { | 2961 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { |
| 2949 | if (test_clear_buffer_dirty(bh)) { | 2962 | if (test_clear_buffer_dirty(bh)) { |
| 2950 | bh->b_end_io = end_buffer_write_sync; | 2963 | bh->b_end_io = end_buffer_write_sync; |
| 2951 | get_bh(bh); | 2964 | get_bh(bh); |
| 2952 | submit_bh(WRITE, bh); | 2965 | if (rw == SWRITE_SYNC) |
| 2966 | submit_bh(WRITE_SYNC, bh); | ||
| 2967 | else | ||
| 2968 | submit_bh(WRITE, bh); | ||
| 2953 | continue; | 2969 | continue; |
| 2954 | } | 2970 | } |
| 2955 | } else { | 2971 | } else { |
| @@ -2978,7 +2994,7 @@ int sync_dirty_buffer(struct buffer_head *bh) | |||
| 2978 | if (test_clear_buffer_dirty(bh)) { | 2994 | if (test_clear_buffer_dirty(bh)) { |
| 2979 | get_bh(bh); | 2995 | get_bh(bh); |
| 2980 | bh->b_end_io = end_buffer_write_sync; | 2996 | bh->b_end_io = end_buffer_write_sync; |
| 2981 | ret = submit_bh(WRITE, bh); | 2997 | ret = submit_bh(WRITE_SYNC, bh); |
| 2982 | wait_on_buffer(bh); | 2998 | wait_on_buffer(bh); |
| 2983 | if (buffer_eopnotsupp(bh)) { | 2999 | if (buffer_eopnotsupp(bh)) { |
| 2984 | clear_buffer_eopnotsupp(bh); | 3000 | clear_buffer_eopnotsupp(bh); |
diff --git a/fs/char_dev.c b/fs/char_dev.c index 68e510b88457..3cb7cda3d780 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c | |||
| @@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) | |||
| 373 | return -ENXIO; | 373 | return -ENXIO; |
| 374 | new = container_of(kobj, struct cdev, kobj); | 374 | new = container_of(kobj, struct cdev, kobj); |
| 375 | spin_lock(&cdev_lock); | 375 | spin_lock(&cdev_lock); |
| 376 | /* Check i_cdev again in case somebody beat us to it while | ||
| 377 | we dropped the lock. */ | ||
| 376 | p = inode->i_cdev; | 378 | p = inode->i_cdev; |
| 377 | if (!p) { | 379 | if (!p) { |
| 378 | inode->i_cdev = p = new; | 380 | inode->i_cdev = p = new; |
| @@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) | |||
| 392 | cdev_put(p); | 394 | cdev_put(p); |
| 393 | return -ENXIO; | 395 | return -ENXIO; |
| 394 | } | 396 | } |
| 395 | if (filp->f_op->open) { | 397 | if (filp->f_op->open) |
| 396 | lock_kernel(); | ||
| 397 | ret = filp->f_op->open(inode,filp); | 398 | ret = filp->f_op->open(inode,filp); |
| 398 | unlock_kernel(); | ||
| 399 | } | ||
| 400 | if (ret) | 399 | if (ret) |
| 401 | cdev_put(p); | 400 | cdev_put(p); |
| 402 | return ret; | 401 | return ret; |
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 28e3d5c5fcac..1f3465201fdf 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES | |||
| @@ -2,6 +2,11 @@ Version 1.53 | |||
| 2 | ------------ | 2 | ------------ |
| 3 | DFS support added (Microsoft Distributed File System client support needed | 3 | DFS support added (Microsoft Distributed File System client support needed |
| 4 | for referrals which enable a hierarchical name space among servers). | 4 | for referrals which enable a hierarchical name space among servers). |
| 5 | Disable temporary caching of mode bits to servers which do not support | ||
| 6 | storing of mode (e.g. Windows servers, when client mounts without cifsacl | ||
| 7 | mount option) and add new "dynperm" mount option to enable temporary caching | ||
| 8 | of mode (enable old behavior). Fix hang on mount caused when server crashes | ||
| 9 | tcp session during negotiate protocol. | ||
| 5 | 10 | ||
| 6 | Version 1.52 | 11 | Version 1.52 |
| 7 | ------------ | 12 | ------------ |
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 34902cff5400..0e9fc2ba90ee 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c | |||
| @@ -34,11 +34,11 @@ | |||
| 34 | static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { | 34 | static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { |
| 35 | {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, | 35 | {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, |
| 36 | {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, | 36 | {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, |
| 37 | {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, | 37 | {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, |
| 38 | {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, | 38 | {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, |
| 39 | {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, | 39 | {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"}, |
| 40 | {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, | 40 | {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"}, |
| 41 | {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } | 41 | {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} } |
| 42 | ; | 42 | ; |
| 43 | 43 | ||
| 44 | 44 | ||
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5df93fd6303f..22857c639df5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
| @@ -97,9 +97,6 @@ cifs_read_super(struct super_block *sb, void *data, | |||
| 97 | { | 97 | { |
| 98 | struct inode *inode; | 98 | struct inode *inode; |
| 99 | struct cifs_sb_info *cifs_sb; | 99 | struct cifs_sb_info *cifs_sb; |
| 100 | #ifdef CONFIG_CIFS_DFS_UPCALL | ||
| 101 | int len; | ||
| 102 | #endif | ||
| 103 | int rc = 0; | 100 | int rc = 0; |
| 104 | 101 | ||
| 105 | /* BB should we make this contingent on mount parm? */ | 102 | /* BB should we make this contingent on mount parm? */ |
| @@ -117,15 +114,17 @@ cifs_read_super(struct super_block *sb, void *data, | |||
| 117 | * complex operation (mount), and in case of fail | 114 | * complex operation (mount), and in case of fail |
| 118 | * just exit instead of doing mount and attempting | 115 | * just exit instead of doing mount and attempting |
| 119 | * undo it if this copy fails?*/ | 116 | * undo it if this copy fails?*/ |
| 120 | len = strlen(data); | 117 | if (data) { |
| 121 | cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); | 118 | int len = strlen(data); |
| 122 | if (cifs_sb->mountdata == NULL) { | 119 | cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); |
| 123 | kfree(sb->s_fs_info); | 120 | if (cifs_sb->mountdata == NULL) { |
| 124 | sb->s_fs_info = NULL; | 121 | kfree(sb->s_fs_info); |
| 125 | return -ENOMEM; | 122 | sb->s_fs_info = NULL; |
| 123 | return -ENOMEM; | ||
| 124 | } | ||
| 125 | strncpy(cifs_sb->mountdata, data, len + 1); | ||
| 126 | cifs_sb->mountdata[len] = '\0'; | ||
| 126 | } | 127 | } |
| 127 | strncpy(cifs_sb->mountdata, data, len + 1); | ||
| 128 | cifs_sb->mountdata[len] = '\0'; | ||
| 129 | #endif | 128 | #endif |
| 130 | 129 | ||
| 131 | rc = cifs_mount(sb, cifs_sb, data, devname); | 130 | rc = cifs_mount(sb, cifs_sb, data, devname); |
| @@ -613,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) | |||
| 613 | if (retval < 0) | 612 | if (retval < 0) |
| 614 | return (loff_t)retval; | 613 | return (loff_t)retval; |
| 615 | } | 614 | } |
| 616 | return remote_llseek(file, offset, origin); | 615 | return generic_file_llseek_unlocked(file, offset, origin); |
| 617 | } | 616 | } |
| 618 | 617 | ||
| 619 | struct file_system_type cifs_fs_type = { | 618 | struct file_system_type cifs_fs_type = { |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 08914053242b..9cfcf326ead3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
| @@ -333,7 +333,6 @@ struct cifsFileInfo { | |||
| 333 | bool messageMode:1; /* for pipes: message vs byte mode */ | 333 | bool messageMode:1; /* for pipes: message vs byte mode */ |
| 334 | atomic_t wrtPending; /* handle in use - defer close */ | 334 | atomic_t wrtPending; /* handle in use - defer close */ |
| 335 | struct semaphore fh_sem; /* prevents reopen race after dead ses*/ | 335 | struct semaphore fh_sem; /* prevents reopen race after dead ses*/ |
| 336 | char *search_resume_name; /* BB removeme BB */ | ||
| 337 | struct cifs_search_info srch_inf; | 336 | struct cifs_search_info srch_inf; |
| 338 | }; | 337 | }; |
| 339 | 338 | ||
| @@ -626,7 +625,7 @@ GLOBAL_EXTERN atomic_t tcpSesAllocCount; | |||
| 626 | GLOBAL_EXTERN atomic_t tcpSesReconnectCount; | 625 | GLOBAL_EXTERN atomic_t tcpSesReconnectCount; |
| 627 | GLOBAL_EXTERN atomic_t tconInfoReconnectCount; | 626 | GLOBAL_EXTERN atomic_t tconInfoReconnectCount; |
| 628 | 627 | ||
| 629 | /* Various Debug counters to remove someday (BB) */ | 628 | /* Various Debug counters */ |
| 630 | GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ | 629 | GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ |
| 631 | #ifdef CONFIG_CIFS_STATS2 | 630 | #ifdef CONFIG_CIFS_STATS2 |
| 632 | GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ | 631 | GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ |
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 65d58b4e6a61..0f327c224da3 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h | |||
| @@ -79,6 +79,19 @@ | |||
| 79 | #define TRANS2_GET_DFS_REFERRAL 0x10 | 79 | #define TRANS2_GET_DFS_REFERRAL 0x10 |
| 80 | #define TRANS2_REPORT_DFS_INCOSISTENCY 0x11 | 80 | #define TRANS2_REPORT_DFS_INCOSISTENCY 0x11 |
| 81 | 81 | ||
| 82 | /* SMB Transact (Named Pipe) subcommand codes */ | ||
| 83 | #define TRANS_SET_NMPIPE_STATE 0x0001 | ||
| 84 | #define TRANS_RAW_READ_NMPIPE 0x0011 | ||
| 85 | #define TRANS_QUERY_NMPIPE_STATE 0x0021 | ||
| 86 | #define TRANS_QUERY_NMPIPE_INFO 0x0022 | ||
| 87 | #define TRANS_PEEK_NMPIPE 0x0023 | ||
| 88 | #define TRANS_TRANSACT_NMPIPE 0x0026 | ||
| 89 | #define TRANS_RAW_WRITE_NMPIPE 0x0031 | ||
| 90 | #define TRANS_READ_NMPIPE 0x0036 | ||
| 91 | #define TRANS_WRITE_NMPIPE 0x0037 | ||
| 92 | #define TRANS_WAIT_NMPIPE 0x0053 | ||
| 93 | #define TRANS_CALL_NMPIPE 0x0054 | ||
| 94 | |||
| 82 | /* NT Transact subcommand codes */ | 95 | /* NT Transact subcommand codes */ |
| 83 | #define NT_TRANSACT_CREATE 0x01 | 96 | #define NT_TRANSACT_CREATE 0x01 |
| 84 | #define NT_TRANSACT_IOCTL 0x02 | 97 | #define NT_TRANSACT_IOCTL 0x02 |
| @@ -328,12 +341,13 @@ | |||
| 328 | #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ | 341 | #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ |
| 329 | #define CREATE_NO_EA_KNOWLEDGE 0x00000200 | 342 | #define CREATE_NO_EA_KNOWLEDGE 0x00000200 |
| 330 | #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete | 343 | #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete |
| 331 | open for recovery flag - should | 344 | "open for recovery" flag - should |
| 332 | be zero */ | 345 | be zero in any case */ |
| 346 | #define CREATE_OPEN_FOR_RECOVERY 0x00000400 | ||
| 333 | #define CREATE_RANDOM_ACCESS 0x00000800 | 347 | #define CREATE_RANDOM_ACCESS 0x00000800 |
| 334 | #define CREATE_DELETE_ON_CLOSE 0x00001000 | 348 | #define CREATE_DELETE_ON_CLOSE 0x00001000 |
| 335 | #define CREATE_OPEN_BY_ID 0x00002000 | 349 | #define CREATE_OPEN_BY_ID 0x00002000 |
| 336 | #define CREATE_OPEN_BACKUP_INTN 0x00004000 | 350 | #define CREATE_OPEN_BACKUP_INTENT 0x00004000 |
| 337 | #define CREATE_NO_COMPRESSION 0x00008000 | 351 | #define CREATE_NO_COMPRESSION 0x00008000 |
| 338 | #define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ | 352 | #define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ |
| 339 | #define OPEN_REPARSE_POINT 0x00200000 | 353 | #define OPEN_REPARSE_POINT 0x00200000 |
| @@ -722,7 +736,6 @@ typedef struct smb_com_tconx_rsp_ext { | |||
| 722 | #define SMB_CSC_CACHE_AUTO_REINT 0x0004 | 736 | #define SMB_CSC_CACHE_AUTO_REINT 0x0004 |
| 723 | #define SMB_CSC_CACHE_VDO 0x0008 | 737 | #define SMB_CSC_CACHE_VDO 0x0008 |
| 724 | #define SMB_CSC_NO_CACHING 0x000C | 738 | #define SMB_CSC_NO_CACHING 0x000C |
| 725 | |||
| 726 | #define SMB_UNIQUE_FILE_NAME 0x0010 | 739 | #define SMB_UNIQUE_FILE_NAME 0x0010 |
| 727 | #define SMB_EXTENDED_SIGNATURES 0x0020 | 740 | #define SMB_EXTENDED_SIGNATURES 0x0020 |
| 728 | 741 | ||
| @@ -806,7 +819,7 @@ typedef struct smb_com_findclose_req { | |||
| 806 | #define ICOUNT_MASK 0x00FF | 819 | #define ICOUNT_MASK 0x00FF |
| 807 | #define PIPE_READ_MODE 0x0100 | 820 | #define PIPE_READ_MODE 0x0100 |
| 808 | #define NAMED_PIPE_TYPE 0x0400 | 821 | #define NAMED_PIPE_TYPE 0x0400 |
| 809 | #define PIPE_END_POINT 0x0800 | 822 | #define PIPE_END_POINT 0x4000 |
| 810 | #define BLOCKING_NAMED_PIPE 0x8000 | 823 | #define BLOCKING_NAMED_PIPE 0x8000 |
| 811 | 824 | ||
| 812 | typedef struct smb_com_open_req { /* also handles create */ | 825 | typedef struct smb_com_open_req { /* also handles create */ |
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index fb655b4593c6..4511b708f0f3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
| @@ -1728,7 +1728,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, | |||
| 1728 | { | 1728 | { |
| 1729 | int rc = 0; | 1729 | int rc = 0; |
| 1730 | LOCK_REQ *pSMB = NULL; | 1730 | LOCK_REQ *pSMB = NULL; |
| 1731 | LOCK_RSP *pSMBr = NULL; | 1731 | /* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */ |
| 1732 | int bytes_returned; | 1732 | int bytes_returned; |
| 1733 | int timeout = 0; | 1733 | int timeout = 0; |
| 1734 | __u16 count; | 1734 | __u16 count; |
| @@ -1739,8 +1739,6 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, | |||
| 1739 | if (rc) | 1739 | if (rc) |
| 1740 | return rc; | 1740 | return rc; |
| 1741 | 1741 | ||
| 1742 | pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */ | ||
| 1743 | |||
| 1744 | if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { | 1742 | if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { |
| 1745 | timeout = CIFS_ASYNC_OP; /* no response expected */ | 1743 | timeout = CIFS_ASYNC_OP; /* no response expected */ |
| 1746 | pSMB->Timeout = 0; | 1744 | pSMB->Timeout = 0; |
| @@ -1774,7 +1772,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, | |||
| 1774 | 1772 | ||
| 1775 | if (waitFlag) { | 1773 | if (waitFlag) { |
| 1776 | rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, | 1774 | rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, |
| 1777 | (struct smb_hdr *) pSMBr, &bytes_returned); | 1775 | (struct smb_hdr *) pSMB, &bytes_returned); |
| 1778 | cifs_small_buf_release(pSMB); | 1776 | cifs_small_buf_release(pSMB); |
| 1779 | } else { | 1777 | } else { |
| 1780 | rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, | 1778 | rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 023434f72c15..e8fa46c7cff2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
| @@ -653,6 +653,7 @@ multi_t2_fnd: | |||
| 653 | spin_lock(&GlobalMid_Lock); | 653 | spin_lock(&GlobalMid_Lock); |
| 654 | server->tcpStatus = CifsExiting; | 654 | server->tcpStatus = CifsExiting; |
| 655 | spin_unlock(&GlobalMid_Lock); | 655 | spin_unlock(&GlobalMid_Lock); |
| 656 | wake_up_all(&server->response_q); | ||
| 656 | 657 | ||
| 657 | /* don't exit until kthread_stop is called */ | 658 | /* don't exit until kthread_stop is called */ |
| 658 | set_current_state(TASK_UNINTERRUPTIBLE); | 659 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -2120,6 +2121,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, | |||
| 2120 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; | 2121 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; |
| 2121 | } | 2122 | } |
| 2122 | 2123 | ||
| 2124 | if ((volume_info.cifs_acl) && (volume_info.dynperm)) | ||
| 2125 | cERROR(1, ("mount option dynperm ignored if cifsacl " | ||
| 2126 | "mount option supported")); | ||
| 2127 | |||
| 2123 | tcon = | 2128 | tcon = |
| 2124 | find_unc(sin_server.sin_addr.s_addr, volume_info.UNC, | 2129 | find_unc(sin_server.sin_addr.s_addr, volume_info.UNC, |
| 2125 | volume_info.username); | 2130 | volume_info.username); |
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f0b5b5f3dd2e..fb69c1fa85c9 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c | |||
| @@ -260,7 +260,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, | |||
| 260 | buf, inode->i_sb, xid, | 260 | buf, inode->i_sb, xid, |
| 261 | &fileHandle); | 261 | &fileHandle); |
| 262 | if (newinode) { | 262 | if (newinode) { |
| 263 | newinode->i_mode = mode; | 263 | if (cifs_sb->mnt_cifs_flags & |
| 264 | CIFS_MOUNT_DYNPERM) | ||
| 265 | newinode->i_mode = mode; | ||
| 264 | if ((oplock & CIFS_CREATE_ACTION) && | 266 | if ((oplock & CIFS_CREATE_ACTION) && |
| 265 | (cifs_sb->mnt_cifs_flags & | 267 | (cifs_sb->mnt_cifs_flags & |
| 266 | CIFS_MOUNT_SET_UID)) { | 268 | CIFS_MOUNT_SET_UID)) { |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8636cec2642c..0aac824371a5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
| @@ -546,7 +546,6 @@ int cifs_close(struct inode *inode, struct file *file) | |||
| 546 | msleep(timeout); | 546 | msleep(timeout); |
| 547 | timeout *= 8; | 547 | timeout *= 8; |
| 548 | } | 548 | } |
| 549 | kfree(pSMBFile->search_resume_name); | ||
| 550 | kfree(file->private_data); | 549 | kfree(file->private_data); |
| 551 | file->private_data = NULL; | 550 | file->private_data = NULL; |
| 552 | } else | 551 | } else |
| @@ -605,12 +604,6 @@ int cifs_closedir(struct inode *inode, struct file *file) | |||
| 605 | else | 604 | else |
| 606 | cifs_buf_release(ptmp); | 605 | cifs_buf_release(ptmp); |
| 607 | } | 606 | } |
| 608 | ptmp = pCFileStruct->search_resume_name; | ||
| 609 | if (ptmp) { | ||
| 610 | cFYI(1, ("closedir free resume name")); | ||
| 611 | pCFileStruct->search_resume_name = NULL; | ||
| 612 | kfree(ptmp); | ||
| 613 | } | ||
| 614 | kfree(file->private_data); | 607 | kfree(file->private_data); |
| 615 | file->private_data = NULL; | 608 | file->private_data = NULL; |
| 616 | } | 609 | } |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 129dbfe4dca7..2e904bd111c8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
| @@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode, | |||
| 219 | rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, | 219 | rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, |
| 220 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & | 220 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & |
| 221 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 221 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
| 222 | if (rc) { | 222 | if (rc == -EREMOTE && !is_dfs_referral) { |
| 223 | if (rc == -EREMOTE && !is_dfs_referral) { | 223 | is_dfs_referral = true; |
| 224 | is_dfs_referral = true; | 224 | cFYI(DBG2, ("DFS ref")); |
| 225 | cFYI(DBG2, ("DFS ref")); | 225 | /* for DFS, server does not give us real inode data */ |
| 226 | /* for DFS, server does not give us real inode data */ | 226 | fill_fake_finddataunix(&find_data, sb); |
| 227 | fill_fake_finddataunix(&find_data, sb); | 227 | rc = 0; |
| 228 | rc = 0; | 228 | } else if (rc) |
| 229 | } | 229 | goto cgiiu_exit; |
| 230 | } | 230 | |
| 231 | num_of_bytes = le64_to_cpu(find_data.NumOfBytes); | 231 | num_of_bytes = le64_to_cpu(find_data.NumOfBytes); |
| 232 | end_of_file = le64_to_cpu(find_data.EndOfFile); | 232 | end_of_file = le64_to_cpu(find_data.EndOfFile); |
| 233 | 233 | ||
| @@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, | |||
| 236 | *pinode = new_inode(sb); | 236 | *pinode = new_inode(sb); |
| 237 | if (*pinode == NULL) { | 237 | if (*pinode == NULL) { |
| 238 | rc = -ENOMEM; | 238 | rc = -ENOMEM; |
| 239 | goto cgiiu_exit; | 239 | goto cgiiu_exit; |
| 240 | } | 240 | } |
| 241 | /* Is an i_ino of zero legal? */ | 241 | /* Is an i_ino of zero legal? */ |
| 242 | /* note ino incremented to unique num in new_inode */ | 242 | /* note ino incremented to unique num in new_inode */ |
| @@ -418,6 +418,7 @@ int cifs_get_inode_info(struct inode **pinode, | |||
| 418 | char *buf = NULL; | 418 | char *buf = NULL; |
| 419 | bool adjustTZ = false; | 419 | bool adjustTZ = false; |
| 420 | bool is_dfs_referral = false; | 420 | bool is_dfs_referral = false; |
| 421 | umode_t default_mode; | ||
| 421 | 422 | ||
| 422 | pTcon = cifs_sb->tcon; | 423 | pTcon = cifs_sb->tcon; |
| 423 | cFYI(1, ("Getting info on %s", full_path)); | 424 | cFYI(1, ("Getting info on %s", full_path)); |
| @@ -530,47 +531,42 @@ int cifs_get_inode_info(struct inode **pinode, | |||
| 530 | inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; | 531 | inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; |
| 531 | } | 532 | } |
| 532 | 533 | ||
| 533 | /* set default mode. will override for dirs below */ | 534 | /* get default inode mode */ |
| 534 | if (atomic_read(&cifsInfo->inUse) == 0) | 535 | if (attr & ATTR_DIRECTORY) |
| 535 | /* new inode, can safely set these fields */ | 536 | default_mode = cifs_sb->mnt_dir_mode; |
| 536 | inode->i_mode = cifs_sb->mnt_file_mode; | 537 | else |
| 537 | else /* since we set the inode type below we need to mask off | 538 | default_mode = cifs_sb->mnt_file_mode; |
| 538 | to avoid strange results if type changes and both | 539 | |
| 539 | get orred in */ | 540 | /* set permission bits */ |
| 540 | inode->i_mode &= ~S_IFMT; | 541 | if (atomic_read(&cifsInfo->inUse) == 0 || |
| 541 | /* if (attr & ATTR_REPARSE) */ | 542 | (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) |
| 542 | /* We no longer handle these as symlinks because we could not | 543 | inode->i_mode = default_mode; |
| 543 | follow them due to the absolute path with drive letter */ | 544 | else { |
| 544 | if (attr & ATTR_DIRECTORY) { | 545 | /* just reenable write bits if !ATTR_READONLY */ |
| 545 | /* override default perms since we do not do byte range locking | 546 | if ((inode->i_mode & S_IWUGO) == 0 && |
| 546 | on dirs */ | 547 | (attr & ATTR_READONLY) == 0) |
| 547 | inode->i_mode = cifs_sb->mnt_dir_mode; | 548 | inode->i_mode |= (S_IWUGO & default_mode); |
| 548 | inode->i_mode |= S_IFDIR; | 549 | inode->i_mode &= ~S_IFMT; |
| 549 | } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && | 550 | } |
| 550 | (cifsInfo->cifsAttrs & ATTR_SYSTEM) && | 551 | /* clear write bits if ATTR_READONLY is set */ |
| 551 | /* No need to le64 convert size of zero */ | 552 | if (attr & ATTR_READONLY) |
| 552 | (pfindData->EndOfFile == 0)) { | 553 | inode->i_mode &= ~S_IWUGO; |
| 553 | inode->i_mode = cifs_sb->mnt_file_mode; | 554 | |
| 554 | inode->i_mode |= S_IFIFO; | 555 | /* set inode type */ |
| 555 | /* BB Finish for SFU style symlinks and devices */ | 556 | if ((attr & ATTR_SYSTEM) && |
| 556 | } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && | 557 | (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { |
| 557 | (cifsInfo->cifsAttrs & ATTR_SYSTEM)) { | 558 | /* no need to fix endianness on 0 */ |
| 558 | if (decode_sfu_inode(inode, le64_to_cpu(pfindData->EndOfFile), | 559 | if (pfindData->EndOfFile == 0) |
| 559 | full_path, cifs_sb, xid)) | 560 | inode->i_mode |= S_IFIFO; |
| 560 | cFYI(1, ("Unrecognized sfu inode type")); | 561 | else if (decode_sfu_inode(inode, |
| 561 | 562 | le64_to_cpu(pfindData->EndOfFile), | |
| 562 | cFYI(1, ("sfu mode 0%o", inode->i_mode)); | 563 | full_path, cifs_sb, xid)) |
| 564 | cFYI(1, ("unknown SFU file type\n")); | ||
| 563 | } else { | 565 | } else { |
| 564 | inode->i_mode |= S_IFREG; | 566 | if (attr & ATTR_DIRECTORY) |
| 565 | /* treat dos attribute of read-only as read-only mode eg 555 */ | 567 | inode->i_mode |= S_IFDIR; |
| 566 | if (cifsInfo->cifsAttrs & ATTR_READONLY) | 568 | else |
| 567 | inode->i_mode &= ~(S_IWUGO); | 569 | inode->i_mode |= S_IFREG; |
| 568 | else if ((inode->i_mode & S_IWUGO) == 0) | ||
| 569 | /* the ATTR_READONLY flag may have been */ | ||
| 570 | /* changed on server -- set any w bits */ | ||
| 571 | /* allowed by mnt_file_mode */ | ||
| 572 | inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); | ||
| 573 | /* BB add code to validate if device or weird share or device type? */ | ||
| 574 | } | 570 | } |
| 575 | 571 | ||
| 576 | spin_lock(&inode->i_lock); | 572 | spin_lock(&inode->i_lock); |
| @@ -1019,8 +1015,11 @@ mkdir_get_info: | |||
| 1019 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 1015 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
| 1020 | } | 1016 | } |
| 1021 | if (direntry->d_inode) { | 1017 | if (direntry->d_inode) { |
| 1022 | direntry->d_inode->i_mode = mode; | 1018 | if (cifs_sb->mnt_cifs_flags & |
| 1023 | direntry->d_inode->i_mode |= S_IFDIR; | 1019 | CIFS_MOUNT_DYNPERM) |
| 1020 | direntry->d_inode->i_mode = | ||
| 1021 | (mode | S_IFDIR); | ||
| 1022 | |||
| 1024 | if (cifs_sb->mnt_cifs_flags & | 1023 | if (cifs_sb->mnt_cifs_flags & |
| 1025 | CIFS_MOUNT_SET_UID) { | 1024 | CIFS_MOUNT_SET_UID) { |
| 1026 | direntry->d_inode->i_uid = | 1025 | direntry->d_inode->i_uid = |
| @@ -1547,13 +1546,26 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1547 | } else | 1546 | } else |
| 1548 | goto cifs_setattr_exit; | 1547 | goto cifs_setattr_exit; |
| 1549 | } | 1548 | } |
| 1550 | if (attrs->ia_valid & ATTR_UID) { | 1549 | |
| 1551 | cFYI(1, ("UID changed to %d", attrs->ia_uid)); | 1550 | /* |
| 1552 | uid = attrs->ia_uid; | 1551 | * Without unix extensions we can't send ownership changes to the |
| 1553 | } | 1552 | * server, so silently ignore them. This is consistent with how |
| 1554 | if (attrs->ia_valid & ATTR_GID) { | 1553 | * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With |
| 1555 | cFYI(1, ("GID changed to %d", attrs->ia_gid)); | 1554 | * CIFSACL support + proper Windows to Unix idmapping, we may be |
| 1556 | gid = attrs->ia_gid; | 1555 | * able to support this in the future. |
| 1556 | */ | ||
| 1557 | if (!pTcon->unix_ext && | ||
| 1558 | !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { | ||
| 1559 | attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); | ||
| 1560 | } else { | ||
| 1561 | if (attrs->ia_valid & ATTR_UID) { | ||
| 1562 | cFYI(1, ("UID changed to %d", attrs->ia_uid)); | ||
| 1563 | uid = attrs->ia_uid; | ||
| 1564 | } | ||
| 1565 | if (attrs->ia_valid & ATTR_GID) { | ||
| 1566 | cFYI(1, ("GID changed to %d", attrs->ia_gid)); | ||
| 1567 | gid = attrs->ia_gid; | ||
| 1568 | } | ||
| 1557 | } | 1569 | } |
| 1558 | 1570 | ||
| 1559 | time_buf.Attributes = 0; | 1571 | time_buf.Attributes = 0; |
| @@ -1563,7 +1575,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1563 | attrs->ia_valid &= ~ATTR_MODE; | 1575 | attrs->ia_valid &= ~ATTR_MODE; |
| 1564 | 1576 | ||
| 1565 | if (attrs->ia_valid & ATTR_MODE) { | 1577 | if (attrs->ia_valid & ATTR_MODE) { |
| 1566 | cFYI(1, ("Mode changed to 0x%x", attrs->ia_mode)); | 1578 | cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); |
| 1567 | mode = attrs->ia_mode; | 1579 | mode = attrs->ia_mode; |
| 1568 | } | 1580 | } |
| 1569 | 1581 | ||
| @@ -1578,18 +1590,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1578 | #ifdef CONFIG_CIFS_EXPERIMENTAL | 1590 | #ifdef CONFIG_CIFS_EXPERIMENTAL |
| 1579 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) | 1591 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) |
| 1580 | rc = mode_to_acl(inode, full_path, mode); | 1592 | rc = mode_to_acl(inode, full_path, mode); |
| 1581 | else if ((mode & S_IWUGO) == 0) { | 1593 | else |
| 1582 | #else | ||
| 1583 | if ((mode & S_IWUGO) == 0) { | ||
| 1584 | #endif | 1594 | #endif |
| 1585 | /* not writeable */ | 1595 | if (((mode & S_IWUGO) == 0) && |
| 1586 | if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) { | 1596 | (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { |
| 1587 | set_dosattr = true; | 1597 | set_dosattr = true; |
| 1588 | time_buf.Attributes = | 1598 | time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs | |
| 1589 | cpu_to_le32(cifsInode->cifsAttrs | | 1599 | ATTR_READONLY); |
| 1590 | ATTR_READONLY); | 1600 | /* fix up mode if we're not using dynperm */ |
| 1591 | } | 1601 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) |
| 1592 | } else if (cifsInode->cifsAttrs & ATTR_READONLY) { | 1602 | attrs->ia_mode = inode->i_mode & ~S_IWUGO; |
| 1603 | } else if ((mode & S_IWUGO) && | ||
| 1604 | (cifsInode->cifsAttrs & ATTR_READONLY)) { | ||
| 1593 | /* If file is readonly on server, we would | 1605 | /* If file is readonly on server, we would |
| 1594 | not be able to write to it - so if any write | 1606 | not be able to write to it - so if any write |
| 1595 | bit is enabled for user or group or other we | 1607 | bit is enabled for user or group or other we |
| @@ -1600,6 +1612,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1600 | /* Windows ignores set to zero */ | 1612 | /* Windows ignores set to zero */ |
| 1601 | if (time_buf.Attributes == 0) | 1613 | if (time_buf.Attributes == 0) |
| 1602 | time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); | 1614 | time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); |
| 1615 | |||
| 1616 | /* reset local inode permissions to normal */ | ||
| 1617 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { | ||
| 1618 | attrs->ia_mode &= ~(S_IALLUGO); | ||
| 1619 | if (S_ISDIR(inode->i_mode)) | ||
| 1620 | attrs->ia_mode |= | ||
| 1621 | cifs_sb->mnt_dir_mode; | ||
| 1622 | else | ||
| 1623 | attrs->ia_mode |= | ||
| 1624 | cifs_sb->mnt_file_mode; | ||
| 1625 | } | ||
| 1626 | } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { | ||
| 1627 | /* ignore mode change - ATTR_READONLY hasn't changed */ | ||
| 1628 | attrs->ia_valid &= ~ATTR_MODE; | ||
| 1603 | } | 1629 | } |
| 1604 | } | 1630 | } |
| 1605 | 1631 | ||
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1d69b8014e0b..4b17f8fe3157 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c | |||
| @@ -519,8 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) | |||
| 519 | pnotify = (struct file_notify_information *) | 519 | pnotify = (struct file_notify_information *) |
| 520 | ((char *)&pSMBr->hdr.Protocol + data_offset); | 520 | ((char *)&pSMBr->hdr.Protocol + data_offset); |
| 521 | cFYI(1, ("dnotify on %s Action: 0x%x", | 521 | cFYI(1, ("dnotify on %s Action: 0x%x", |
| 522 | pnotify->FileName, | 522 | pnotify->FileName, pnotify->Action)); |
| 523 | pnotify->Action)); /* BB removeme BB */ | ||
| 524 | /* cifs_dump_mem("Rcvd notify Data: ",buf, | 523 | /* cifs_dump_mem("Rcvd notify Data: ",buf, |
| 525 | sizeof(struct smb_hdr)+60); */ | 524 | sizeof(struct smb_hdr)+60); */ |
| 526 | return true; | 525 | return true; |
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 713c25110197..83f306954883 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
| @@ -132,6 +132,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, | |||
| 132 | __u32 attr; | 132 | __u32 attr; |
| 133 | __u64 allocation_size; | 133 | __u64 allocation_size; |
| 134 | __u64 end_of_file; | 134 | __u64 end_of_file; |
| 135 | umode_t default_mode; | ||
| 135 | 136 | ||
| 136 | /* save mtime and size */ | 137 | /* save mtime and size */ |
| 137 | local_mtime = tmp_inode->i_mtime; | 138 | local_mtime = tmp_inode->i_mtime; |
| @@ -187,48 +188,54 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, | |||
| 187 | if (atomic_read(&cifsInfo->inUse) == 0) { | 188 | if (atomic_read(&cifsInfo->inUse) == 0) { |
| 188 | tmp_inode->i_uid = cifs_sb->mnt_uid; | 189 | tmp_inode->i_uid = cifs_sb->mnt_uid; |
| 189 | tmp_inode->i_gid = cifs_sb->mnt_gid; | 190 | tmp_inode->i_gid = cifs_sb->mnt_gid; |
| 190 | /* set default mode. will override for dirs below */ | 191 | } |
| 191 | tmp_inode->i_mode = cifs_sb->mnt_file_mode; | 192 | |
| 192 | } else { | 193 | if (attr & ATTR_DIRECTORY) |
| 193 | /* mask off the type bits since it gets set | 194 | default_mode = cifs_sb->mnt_dir_mode; |
| 194 | below and we do not want to get two type | 195 | else |
| 195 | bits set */ | 196 | default_mode = cifs_sb->mnt_file_mode; |
| 197 | |||
| 198 | /* set initial permissions */ | ||
| 199 | if ((atomic_read(&cifsInfo->inUse) == 0) || | ||
| 200 | (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) | ||
| 201 | tmp_inode->i_mode = default_mode; | ||
| 202 | else { | ||
| 203 | /* just reenable write bits if !ATTR_READONLY */ | ||
| 204 | if ((tmp_inode->i_mode & S_IWUGO) == 0 && | ||
| 205 | (attr & ATTR_READONLY) == 0) | ||
| 206 | tmp_inode->i_mode |= (S_IWUGO & default_mode); | ||
| 207 | |||
| 196 | tmp_inode->i_mode &= ~S_IFMT; | 208 | tmp_inode->i_mode &= ~S_IFMT; |
| 197 | } | 209 | } |
| 198 | 210 | ||
| 199 | if (attr & ATTR_DIRECTORY) { | 211 | /* clear write bits if ATTR_READONLY is set */ |
| 200 | *pobject_type = DT_DIR; | 212 | if (attr & ATTR_READONLY) |
| 201 | /* override default perms since we do not lock dirs */ | 213 | tmp_inode->i_mode &= ~S_IWUGO; |
| 202 | if (atomic_read(&cifsInfo->inUse) == 0) | 214 | |
| 203 | tmp_inode->i_mode = cifs_sb->mnt_dir_mode; | 215 | /* set inode type */ |
| 204 | tmp_inode->i_mode |= S_IFDIR; | 216 | if ((attr & ATTR_SYSTEM) && |
| 205 | } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && | 217 | (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { |
| 206 | (attr & ATTR_SYSTEM)) { | ||
| 207 | if (end_of_file == 0) { | 218 | if (end_of_file == 0) { |
| 208 | *pobject_type = DT_FIFO; | ||
| 209 | tmp_inode->i_mode |= S_IFIFO; | 219 | tmp_inode->i_mode |= S_IFIFO; |
| 220 | *pobject_type = DT_FIFO; | ||
| 210 | } else { | 221 | } else { |
| 211 | /* rather than get the type here, we mark the | 222 | /* |
| 212 | inode as needing revalidate and get the real type | 223 | * trying to get the type can be slow, so just call |
| 213 | (blk vs chr vs. symlink) later ie in lookup */ | 224 | * this a regular file for now, and mark for reval |
| 214 | *pobject_type = DT_REG; | 225 | */ |
| 215 | tmp_inode->i_mode |= S_IFREG; | 226 | tmp_inode->i_mode |= S_IFREG; |
| 227 | *pobject_type = DT_REG; | ||
| 216 | cifsInfo->time = 0; | 228 | cifsInfo->time = 0; |
| 217 | } | 229 | } |
| 218 | /* we no longer mark these because we could not follow them */ | ||
| 219 | /* } else if (attr & ATTR_REPARSE) { | ||
| 220 | *pobject_type = DT_LNK; | ||
| 221 | tmp_inode->i_mode |= S_IFLNK; */ | ||
| 222 | } else { | 230 | } else { |
| 223 | *pobject_type = DT_REG; | 231 | if (attr & ATTR_DIRECTORY) { |
| 224 | tmp_inode->i_mode |= S_IFREG; | 232 | tmp_inode->i_mode |= S_IFDIR; |
| 225 | if (attr & ATTR_READONLY) | 233 | *pobject_type = DT_DIR; |
| 226 | tmp_inode->i_mode &= ~(S_IWUGO); | 234 | } else { |
| 227 | else if ((tmp_inode->i_mode & S_IWUGO) == 0) | 235 | tmp_inode->i_mode |= S_IFREG; |
| 228 | /* the ATTR_READONLY flag may have been changed on */ | 236 | *pobject_type = DT_REG; |
| 229 | /* server -- set any w bits allowed by mnt_file_mode */ | 237 | } |
| 230 | tmp_inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); | 238 | } |
| 231 | } /* could add code here - to validate if device or weird share type? */ | ||
| 232 | 239 | ||
| 233 | /* can not fill in nlink here as in qpathinfo version and Unx search */ | 240 | /* can not fill in nlink here as in qpathinfo version and Unx search */ |
| 234 | if (atomic_read(&cifsInfo->inUse) == 0) | 241 | if (atomic_read(&cifsInfo->inUse) == 0) |
| @@ -675,8 +682,6 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, | |||
| 675 | cifsFile->invalidHandle = true; | 682 | cifsFile->invalidHandle = true; |
| 676 | CIFSFindClose(xid, pTcon, cifsFile->netfid); | 683 | CIFSFindClose(xid, pTcon, cifsFile->netfid); |
| 677 | } | 684 | } |
| 678 | kfree(cifsFile->search_resume_name); | ||
| 679 | cifsFile->search_resume_name = NULL; | ||
| 680 | if (cifsFile->srch_inf.ntwrk_buf_start) { | 685 | if (cifsFile->srch_inf.ntwrk_buf_start) { |
| 681 | cFYI(1, ("freeing SMB ff cache buf on search rewind")); | 686 | cFYI(1, ("freeing SMB ff cache buf on search rewind")); |
| 682 | if (cifsFile->srch_inf.smallBuf) | 687 | if (cifsFile->srch_inf.smallBuf) |
| @@ -1043,9 +1048,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) | |||
| 1043 | } /* else { | 1048 | } /* else { |
| 1044 | cifsFile->invalidHandle = true; | 1049 | cifsFile->invalidHandle = true; |
| 1045 | CIFSFindClose(xid, pTcon, cifsFile->netfid); | 1050 | CIFSFindClose(xid, pTcon, cifsFile->netfid); |
| 1046 | } | 1051 | } */ |
| 1047 | kfree(cifsFile->search_resume_name); | ||
| 1048 | cifsFile->search_resume_name = NULL; */ | ||
| 1049 | 1052 | ||
| 1050 | rc = find_cifs_entry(xid, pTcon, file, | 1053 | rc = find_cifs_entry(xid, pTcon, file, |
| 1051 | ¤t_entry, &num_to_fill); | 1054 | ¤t_entry, &num_to_fill); |
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 97dba0d92348..c54eaab71a19 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
| @@ -69,9 +69,11 @@ | |||
| 69 | #include <linux/capi.h> | 69 | #include <linux/capi.h> |
| 70 | #include <linux/gigaset_dev.h> | 70 | #include <linux/gigaset_dev.h> |
| 71 | 71 | ||
| 72 | #ifdef CONFIG_BLOCK | ||
| 72 | #include <scsi/scsi.h> | 73 | #include <scsi/scsi.h> |
| 73 | #include <scsi/scsi_ioctl.h> | 74 | #include <scsi/scsi_ioctl.h> |
| 74 | #include <scsi/sg.h> | 75 | #include <scsi/sg.h> |
| 76 | #endif | ||
| 75 | 77 | ||
| 76 | #include <asm/uaccess.h> | 78 | #include <asm/uaccess.h> |
| 77 | #include <linux/ethtool.h> | 79 | #include <linux/ethtool.h> |
| @@ -2024,6 +2026,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP) | |||
| 2024 | COMPATIBLE_IOCTL(PIO_UNISCRNMAP) | 2026 | COMPATIBLE_IOCTL(PIO_UNISCRNMAP) |
| 2025 | COMPATIBLE_IOCTL(PIO_FONTRESET) | 2027 | COMPATIBLE_IOCTL(PIO_FONTRESET) |
| 2026 | COMPATIBLE_IOCTL(PIO_UNIMAPCLR) | 2028 | COMPATIBLE_IOCTL(PIO_UNIMAPCLR) |
| 2029 | #ifdef CONFIG_BLOCK | ||
| 2027 | /* Big S */ | 2030 | /* Big S */ |
| 2028 | COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) | 2031 | COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) |
| 2029 | COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) | 2032 | COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) |
| @@ -2033,6 +2036,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) | |||
| 2033 | COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) | 2036 | COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) |
| 2034 | COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) | 2037 | COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) |
| 2035 | COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) | 2038 | COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) |
| 2039 | #endif | ||
| 2036 | /* Big T */ | 2040 | /* Big T */ |
| 2037 | COMPATIBLE_IOCTL(TUNSETNOCSUM) | 2041 | COMPATIBLE_IOCTL(TUNSETNOCSUM) |
| 2038 | COMPATIBLE_IOCTL(TUNSETDEBUG) | 2042 | COMPATIBLE_IOCTL(TUNSETDEBUG) |
| @@ -2103,6 +2107,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN) | |||
| 2103 | COMPATIBLE_IOCTL(SIOCSIFVLAN) | 2107 | COMPATIBLE_IOCTL(SIOCSIFVLAN) |
| 2104 | COMPATIBLE_IOCTL(SIOCBRADDBR) | 2108 | COMPATIBLE_IOCTL(SIOCBRADDBR) |
| 2105 | COMPATIBLE_IOCTL(SIOCBRDELBR) | 2109 | COMPATIBLE_IOCTL(SIOCBRDELBR) |
| 2110 | #ifdef CONFIG_BLOCK | ||
| 2106 | /* SG stuff */ | 2111 | /* SG stuff */ |
| 2107 | COMPATIBLE_IOCTL(SG_SET_TIMEOUT) | 2112 | COMPATIBLE_IOCTL(SG_SET_TIMEOUT) |
| 2108 | COMPATIBLE_IOCTL(SG_GET_TIMEOUT) | 2113 | COMPATIBLE_IOCTL(SG_GET_TIMEOUT) |
| @@ -2127,6 +2132,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET) | |||
| 2127 | COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) | 2132 | COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) |
| 2128 | COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) | 2133 | COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) |
| 2129 | COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) | 2134 | COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) |
| 2135 | #endif | ||
| 2130 | /* PPP stuff */ | 2136 | /* PPP stuff */ |
| 2131 | COMPATIBLE_IOCTL(PPPIOCGFLAGS) | 2137 | COMPATIBLE_IOCTL(PPPIOCGFLAGS) |
| 2132 | COMPATIBLE_IOCTL(PPPIOCSFLAGS) | 2138 | COMPATIBLE_IOCTL(PPPIOCSFLAGS) |
diff --git a/fs/dcache.c b/fs/dcache.c index 3ee588d5f585..6068c25b393c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
| 18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
| 19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
| 20 | #include <linux/fdtable.h> | ||
| 20 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
| 21 | #include <linux/fsnotify.h> | 22 | #include <linux/fsnotify.h> |
| 22 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| @@ -106,9 +107,10 @@ static void dentry_lru_remove(struct dentry *dentry) | |||
| 106 | /* | 107 | /* |
| 107 | * Release the dentry's inode, using the filesystem | 108 | * Release the dentry's inode, using the filesystem |
| 108 | * d_iput() operation if defined. | 109 | * d_iput() operation if defined. |
| 109 | * Called with dcache_lock and per dentry lock held, drops both. | ||
| 110 | */ | 110 | */ |
| 111 | static void dentry_iput(struct dentry * dentry) | 111 | static void dentry_iput(struct dentry * dentry) |
| 112 | __releases(dentry->d_lock) | ||
| 113 | __releases(dcache_lock) | ||
| 112 | { | 114 | { |
| 113 | struct inode *inode = dentry->d_inode; | 115 | struct inode *inode = dentry->d_inode; |
| 114 | if (inode) { | 116 | if (inode) { |
| @@ -132,12 +134,13 @@ static void dentry_iput(struct dentry * dentry) | |||
| 132 | * d_kill - kill dentry and return parent | 134 | * d_kill - kill dentry and return parent |
| 133 | * @dentry: dentry to kill | 135 | * @dentry: dentry to kill |
| 134 | * | 136 | * |
| 135 | * Called with dcache_lock and d_lock, releases both. The dentry must | 137 | * The dentry must already be unhashed and removed from the LRU. |
| 136 | * already be unhashed and removed from the LRU. | ||
| 137 | * | 138 | * |
| 138 | * If this is the root of the dentry tree, return NULL. | 139 | * If this is the root of the dentry tree, return NULL. |
| 139 | */ | 140 | */ |
| 140 | static struct dentry *d_kill(struct dentry *dentry) | 141 | static struct dentry *d_kill(struct dentry *dentry) |
| 142 | __releases(dentry->d_lock) | ||
| 143 | __releases(dcache_lock) | ||
| 141 | { | 144 | { |
| 142 | struct dentry *parent; | 145 | struct dentry *parent; |
| 143 | 146 | ||
| @@ -383,11 +386,11 @@ restart: | |||
| 383 | * Try to prune ancestors as well. This is necessary to prevent | 386 | * Try to prune ancestors as well. This is necessary to prevent |
| 384 | * quadratic behavior of shrink_dcache_parent(), but is also expected | 387 | * quadratic behavior of shrink_dcache_parent(), but is also expected |
| 385 | * to be beneficial in reducing dentry cache fragmentation. | 388 | * to be beneficial in reducing dentry cache fragmentation. |
| 386 | * | ||
| 387 | * Called with dcache_lock, drops it and then regains. | ||
| 388 | * Called with dentry->d_lock held, drops it. | ||
| 389 | */ | 389 | */ |
| 390 | static void prune_one_dentry(struct dentry * dentry) | 390 | static void prune_one_dentry(struct dentry * dentry) |
| 391 | __releases(dentry->d_lock) | ||
| 392 | __releases(dcache_lock) | ||
| 393 | __acquires(dcache_lock) | ||
| 391 | { | 394 | { |
| 392 | __d_drop(dentry); | 395 | __d_drop(dentry); |
| 393 | dentry = d_kill(dentry); | 396 | dentry = d_kill(dentry); |
| @@ -1604,10 +1607,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2) | |||
| 1604 | * | 1607 | * |
| 1605 | * Note: If ever the locking in lock_rename() changes, then please | 1608 | * Note: If ever the locking in lock_rename() changes, then please |
| 1606 | * remember to update this too... | 1609 | * remember to update this too... |
| 1607 | * | ||
| 1608 | * On return, dcache_lock will have been unlocked. | ||
| 1609 | */ | 1610 | */ |
| 1610 | static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) | 1611 | static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) |
| 1612 | __releases(dcache_lock) | ||
| 1611 | { | 1613 | { |
| 1612 | struct mutex *m1 = NULL, *m2 = NULL; | 1614 | struct mutex *m1 = NULL, *m2 = NULL; |
| 1613 | struct dentry *ret; | 1615 | struct dentry *ret; |
| @@ -1743,11 +1745,9 @@ out_nolock: | |||
| 1743 | shouldnt_be_hashed: | 1745 | shouldnt_be_hashed: |
| 1744 | spin_unlock(&dcache_lock); | 1746 | spin_unlock(&dcache_lock); |
| 1745 | BUG(); | 1747 | BUG(); |
| 1746 | goto shouldnt_be_hashed; | ||
| 1747 | } | 1748 | } |
| 1748 | 1749 | ||
| 1749 | static int prepend(char **buffer, int *buflen, const char *str, | 1750 | static int prepend(char **buffer, int *buflen, const char *str, int namelen) |
| 1750 | int namelen) | ||
| 1751 | { | 1751 | { |
| 1752 | *buflen -= namelen; | 1752 | *buflen -= namelen; |
| 1753 | if (*buflen < 0) | 1753 | if (*buflen < 0) |
| @@ -1757,8 +1757,13 @@ static int prepend(char **buffer, int *buflen, const char *str, | |||
| 1757 | return 0; | 1757 | return 0; |
| 1758 | } | 1758 | } |
| 1759 | 1759 | ||
| 1760 | static int prepend_name(char **buffer, int *buflen, struct qstr *name) | ||
| 1761 | { | ||
| 1762 | return prepend(buffer, buflen, name->name, name->len); | ||
| 1763 | } | ||
| 1764 | |||
| 1760 | /** | 1765 | /** |
| 1761 | * d_path - return the path of a dentry | 1766 | * __d_path - return the path of a dentry |
| 1762 | * @path: the dentry/vfsmount to report | 1767 | * @path: the dentry/vfsmount to report |
| 1763 | * @root: root vfsmnt/dentry (may be modified by this function) | 1768 | * @root: root vfsmnt/dentry (may be modified by this function) |
| 1764 | * @buffer: buffer to return value in | 1769 | * @buffer: buffer to return value in |
| @@ -1779,9 +1784,10 @@ char *__d_path(const struct path *path, struct path *root, | |||
| 1779 | { | 1784 | { |
| 1780 | struct dentry *dentry = path->dentry; | 1785 | struct dentry *dentry = path->dentry; |
| 1781 | struct vfsmount *vfsmnt = path->mnt; | 1786 | struct vfsmount *vfsmnt = path->mnt; |
| 1782 | char * end = buffer+buflen; | 1787 | char *end = buffer + buflen; |
| 1783 | char * retval; | 1788 | char *retval; |
| 1784 | 1789 | ||
| 1790 | spin_lock(&vfsmount_lock); | ||
| 1785 | prepend(&end, &buflen, "\0", 1); | 1791 | prepend(&end, &buflen, "\0", 1); |
| 1786 | if (!IS_ROOT(dentry) && d_unhashed(dentry) && | 1792 | if (!IS_ROOT(dentry) && d_unhashed(dentry) && |
| 1787 | (prepend(&end, &buflen, " (deleted)", 10) != 0)) | 1793 | (prepend(&end, &buflen, " (deleted)", 10) != 0)) |
| @@ -1800,38 +1806,37 @@ char *__d_path(const struct path *path, struct path *root, | |||
| 1800 | break; | 1806 | break; |
| 1801 | if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { | 1807 | if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { |
| 1802 | /* Global root? */ | 1808 | /* Global root? */ |
| 1803 | spin_lock(&vfsmount_lock); | ||
| 1804 | if (vfsmnt->mnt_parent == vfsmnt) { | 1809 | if (vfsmnt->mnt_parent == vfsmnt) { |
| 1805 | spin_unlock(&vfsmount_lock); | ||
| 1806 | goto global_root; | 1810 | goto global_root; |
| 1807 | } | 1811 | } |
| 1808 | dentry = vfsmnt->mnt_mountpoint; | 1812 | dentry = vfsmnt->mnt_mountpoint; |
| 1809 | vfsmnt = vfsmnt->mnt_parent; | 1813 | vfsmnt = vfsmnt->mnt_parent; |
| 1810 | spin_unlock(&vfsmount_lock); | ||
| 1811 | continue; | 1814 | continue; |
| 1812 | } | 1815 | } |
| 1813 | parent = dentry->d_parent; | 1816 | parent = dentry->d_parent; |
| 1814 | prefetch(parent); | 1817 | prefetch(parent); |
| 1815 | if ((prepend(&end, &buflen, dentry->d_name.name, | 1818 | if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || |
| 1816 | dentry->d_name.len) != 0) || | ||
| 1817 | (prepend(&end, &buflen, "/", 1) != 0)) | 1819 | (prepend(&end, &buflen, "/", 1) != 0)) |
| 1818 | goto Elong; | 1820 | goto Elong; |
| 1819 | retval = end; | 1821 | retval = end; |
| 1820 | dentry = parent; | 1822 | dentry = parent; |
| 1821 | } | 1823 | } |
| 1822 | 1824 | ||
| 1825 | out: | ||
| 1826 | spin_unlock(&vfsmount_lock); | ||
| 1823 | return retval; | 1827 | return retval; |
| 1824 | 1828 | ||
| 1825 | global_root: | 1829 | global_root: |
| 1826 | retval += 1; /* hit the slash */ | 1830 | retval += 1; /* hit the slash */ |
| 1827 | if (prepend(&retval, &buflen, dentry->d_name.name, | 1831 | if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) |
| 1828 | dentry->d_name.len) != 0) | ||
| 1829 | goto Elong; | 1832 | goto Elong; |
| 1830 | root->mnt = vfsmnt; | 1833 | root->mnt = vfsmnt; |
| 1831 | root->dentry = dentry; | 1834 | root->dentry = dentry; |
| 1832 | return retval; | 1835 | goto out; |
| 1836 | |||
| 1833 | Elong: | 1837 | Elong: |
| 1834 | return ERR_PTR(-ENAMETOOLONG); | 1838 | retval = ERR_PTR(-ENAMETOOLONG); |
| 1839 | goto out; | ||
| 1835 | } | 1840 | } |
| 1836 | 1841 | ||
| 1837 | /** | 1842 | /** |
| @@ -1845,9 +1850,9 @@ Elong: | |||
| 1845 | * | 1850 | * |
| 1846 | * Returns the buffer or an error code if the path was too long. | 1851 | * Returns the buffer or an error code if the path was too long. |
| 1847 | * | 1852 | * |
| 1848 | * "buflen" should be positive. Caller holds the dcache_lock. | 1853 | * "buflen" should be positive. |
| 1849 | */ | 1854 | */ |
| 1850 | char *d_path(struct path *path, char *buf, int buflen) | 1855 | char *d_path(const struct path *path, char *buf, int buflen) |
| 1851 | { | 1856 | { |
| 1852 | char *res; | 1857 | char *res; |
| 1853 | struct path root; | 1858 | struct path root; |
| @@ -1915,16 +1920,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) | |||
| 1915 | retval = end-1; | 1920 | retval = end-1; |
| 1916 | *retval = '/'; | 1921 | *retval = '/'; |
| 1917 | 1922 | ||
| 1918 | for (;;) { | 1923 | while (!IS_ROOT(dentry)) { |
| 1919 | struct dentry *parent; | 1924 | struct dentry *parent = dentry->d_parent; |
| 1920 | if (IS_ROOT(dentry)) | ||
| 1921 | break; | ||
| 1922 | 1925 | ||
| 1923 | parent = dentry->d_parent; | ||
| 1924 | prefetch(parent); | 1926 | prefetch(parent); |
| 1925 | 1927 | if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || | |
| 1926 | if ((prepend(&end, &buflen, dentry->d_name.name, | ||
| 1927 | dentry->d_name.len) != 0) || | ||
| 1928 | (prepend(&end, &buflen, "/", 1) != 0)) | 1928 | (prepend(&end, &buflen, "/", 1) != 0)) |
| 1929 | goto Elong; | 1929 | goto Elong; |
| 1930 | 1930 | ||
| @@ -1975,7 +1975,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) | |||
| 1975 | error = -ENOENT; | 1975 | error = -ENOENT; |
| 1976 | /* Has the current directory has been unlinked? */ | 1976 | /* Has the current directory has been unlinked? */ |
| 1977 | spin_lock(&dcache_lock); | 1977 | spin_lock(&dcache_lock); |
| 1978 | if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { | 1978 | if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { |
| 1979 | unsigned long len; | 1979 | unsigned long len; |
| 1980 | struct path tmp = root; | 1980 | struct path tmp = root; |
| 1981 | char * cwd; | 1981 | char * cwd; |
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebbcf38fd33b..f976f303c196 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/poll.h> | 15 | #include <linux/poll.h> |
| 16 | #include <linux/signal.h> | 16 | #include <linux/signal.h> |
| 17 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
| 18 | #include <linux/smp_lock.h> | ||
| 18 | #include <linux/dlm.h> | 19 | #include <linux/dlm.h> |
| 19 | #include <linux/dlm_device.h> | 20 | #include <linux/dlm_device.h> |
| 20 | 21 | ||
| @@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file) | |||
| 618 | struct dlm_user_proc *proc; | 619 | struct dlm_user_proc *proc; |
| 619 | struct dlm_ls *ls; | 620 | struct dlm_ls *ls; |
| 620 | 621 | ||
| 622 | lock_kernel(); | ||
| 621 | ls = dlm_find_lockspace_device(iminor(inode)); | 623 | ls = dlm_find_lockspace_device(iminor(inode)); |
| 622 | if (!ls) | 624 | if (!ls) { |
| 625 | unlock_kernel(); | ||
| 623 | return -ENOENT; | 626 | return -ENOENT; |
| 627 | } | ||
| 624 | 628 | ||
| 625 | proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); | 629 | proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); |
| 626 | if (!proc) { | 630 | if (!proc) { |
| 627 | dlm_put_lockspace(ls); | 631 | dlm_put_lockspace(ls); |
| 632 | unlock_kernel(); | ||
| 628 | return -ENOMEM; | 633 | return -ENOMEM; |
| 629 | } | 634 | } |
| 630 | 635 | ||
| @@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file) | |||
| 636 | spin_lock_init(&proc->locks_spin); | 641 | spin_lock_init(&proc->locks_spin); |
| 637 | init_waitqueue_head(&proc->wait); | 642 | init_waitqueue_head(&proc->wait); |
| 638 | file->private_data = proc; | 643 | file->private_data = proc; |
| 644 | unlock_kernel(); | ||
| 639 | 645 | ||
| 640 | return 0; | 646 | return 0; |
| 641 | } | 647 | } |
| @@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait) | |||
| 870 | 876 | ||
| 871 | static int ctl_device_open(struct inode *inode, struct file *file) | 877 | static int ctl_device_open(struct inode *inode, struct file *file) |
| 872 | { | 878 | { |
| 879 | cycle_kernel_lock(); | ||
| 873 | file->private_data = NULL; | 880 | file->private_data = NULL; |
| 874 | return 0; | 881 | return 0; |
| 875 | } | 882 | } |
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2258b8f654a6..24749bf0668f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
| 31 | #include <linux/compat.h> | 31 | #include <linux/compat.h> |
| 32 | #include <linux/fs_stack.h> | 32 | #include <linux/fs_stack.h> |
| 33 | #include <linux/smp_lock.h> | ||
| 33 | #include "ecryptfs_kernel.h" | 34 | #include "ecryptfs_kernel.h" |
| 34 | 35 | ||
| 35 | /** | 36 | /** |
| @@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) | |||
| 277 | int rc = 0; | 278 | int rc = 0; |
| 278 | struct file *lower_file = NULL; | 279 | struct file *lower_file = NULL; |
| 279 | 280 | ||
| 281 | lock_kernel(); | ||
| 280 | lower_file = ecryptfs_file_to_lower(file); | 282 | lower_file = ecryptfs_file_to_lower(file); |
| 281 | if (lower_file->f_op && lower_file->f_op->fasync) | 283 | if (lower_file->f_op && lower_file->f_op->fasync) |
| 282 | rc = lower_file->f_op->fasync(fd, lower_file, flag); | 284 | rc = lower_file->f_op->fasync(fd, lower_file, flag); |
| 285 | unlock_kernel(); | ||
| 283 | return rc; | 286 | return rc; |
| 284 | } | 287 | } |
| 285 | 288 | ||
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 50c994a249a5..09a4522f65e6 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c | |||
| @@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void) | |||
| 575 | int rc; | 575 | int rc; |
| 576 | 576 | ||
| 577 | atomic_set(&ecryptfs_num_miscdev_opens, 0); | 577 | atomic_set(&ecryptfs_num_miscdev_opens, 0); |
| 578 | mutex_lock(&ecryptfs_daemon_hash_mux); | ||
| 579 | rc = misc_register(&ecryptfs_miscdev); | 578 | rc = misc_register(&ecryptfs_miscdev); |
| 580 | if (rc) | 579 | if (rc) |
| 581 | printk(KERN_ERR "%s: Failed to register miscellaneous device " | 580 | printk(KERN_ERR "%s: Failed to register miscellaneous device " |
| 582 | "for communications with userspace daemons; rc = [%d]\n", | 581 | "for communications with userspace daemons; rc = [%d]\n", |
| 583 | __func__, rc); | 582 | __func__, rc); |
| 584 | mutex_unlock(&ecryptfs_daemon_hash_mux); | ||
| 585 | return rc; | 583 | return rc; |
| 586 | } | 584 | } |
| 587 | 585 | ||
| @@ -26,7 +26,6 @@ | |||
| 26 | #include <linux/file.h> | 26 | #include <linux/file.h> |
| 27 | #include <linux/fdtable.h> | 27 | #include <linux/fdtable.h> |
| 28 | #include <linux/mman.h> | 28 | #include <linux/mman.h> |
| 29 | #include <linux/a.out.h> | ||
| 30 | #include <linux/stat.h> | 29 | #include <linux/stat.h> |
| 31 | #include <linux/fcntl.h> | 30 | #include <linux/fcntl.h> |
| 32 | #include <linux/smp_lock.h> | 31 | #include <linux/smp_lock.h> |
| @@ -61,6 +60,11 @@ | |||
| 61 | #include <linux/kmod.h> | 60 | #include <linux/kmod.h> |
| 62 | #endif | 61 | #endif |
| 63 | 62 | ||
| 63 | #ifdef __alpha__ | ||
| 64 | /* for /sbin/loader handling in search_binary_handler() */ | ||
| 65 | #include <linux/a.out.h> | ||
| 66 | #endif | ||
| 67 | |||
| 64 | int core_uses_pid; | 68 | int core_uses_pid; |
| 65 | char core_pattern[CORENAME_MAX_SIZE] = "core"; | 69 | char core_pattern[CORENAME_MAX_SIZE] = "core"; |
| 66 | int suid_dumpable = 0; | 70 | int suid_dumpable = 0; |
| @@ -606,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm, | |||
| 606 | bprm->exec -= stack_shift; | 610 | bprm->exec -= stack_shift; |
| 607 | 611 | ||
| 608 | down_write(&mm->mmap_sem); | 612 | down_write(&mm->mmap_sem); |
| 609 | vm_flags = vma->vm_flags; | 613 | vm_flags = VM_STACK_FLAGS; |
| 610 | 614 | ||
| 611 | /* | 615 | /* |
| 612 | * Adjust stack execute permissions; explicitly enable for | 616 | * Adjust stack execute permissions; explicitly enable for |
| @@ -1155,7 +1159,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) | |||
| 1155 | { | 1159 | { |
| 1156 | int try,retval; | 1160 | int try,retval; |
| 1157 | struct linux_binfmt *fmt; | 1161 | struct linux_binfmt *fmt; |
| 1158 | #if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT) | 1162 | #ifdef __alpha__ |
| 1159 | /* handle /sbin/loader.. */ | 1163 | /* handle /sbin/loader.. */ |
| 1160 | { | 1164 | { |
| 1161 | struct exec * eh = (struct exec *) bprm->buf; | 1165 | struct exec * eh = (struct exec *) bprm->buf; |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index fe3119a71ada..2845425077e8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, | |||
| 2875 | blk++; | 2875 | blk++; |
| 2876 | } | 2876 | } |
| 2877 | out: | 2877 | out: |
| 2878 | if (len == towrite) | 2878 | if (len == towrite) { |
| 2879 | mutex_unlock(&inode->i_mutex); | ||
| 2879 | return err; | 2880 | return err; |
| 2881 | } | ||
| 2880 | if (inode->i_size < off+len-towrite) { | 2882 | if (inode->i_size < off+len-towrite) { |
| 2881 | i_size_write(inode, off+len-towrite); | 2883 | i_size_write(inode, off+len-towrite); |
| 2882 | EXT3_I(inode)->i_disksize = inode->i_size; | 2884 | EXT3_I(inode)->i_disksize = inode->i_size; |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9cc80b9cc8d8..495ab21b9832 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
| @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, | |||
| 47 | ext4_group_t block_group) | 47 | ext4_group_t block_group) |
| 48 | { | 48 | { |
| 49 | ext4_group_t actual_group; | 49 | ext4_group_t actual_group; |
| 50 | ext4_get_group_no_and_offset(sb, block, &actual_group, 0); | 50 | ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); |
| 51 | if (actual_group == block_group) | 51 | if (actual_group == block_group) |
| 52 | return 1; | 52 | return 1; |
| 53 | return 0; | 53 | return 0; |
| @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
| 121 | le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); | 121 | le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); |
| 122 | } | 122 | } |
| 123 | } else { /* For META_BG_BLOCK_GROUPS */ | 123 | } else { /* For META_BG_BLOCK_GROUPS */ |
| 124 | int group_rel = (block_group - | 124 | bit_max += ext4_bg_num_gdb(sb, block_group); |
| 125 | le32_to_cpu(sbi->s_es->s_first_meta_bg)) % | ||
| 126 | EXT4_DESC_PER_BLOCK(sb); | ||
| 127 | if (group_rel == 0 || group_rel == 1 || | ||
| 128 | (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) | ||
| 129 | bit_max += 1; | ||
| 130 | } | 125 | } |
| 131 | 126 | ||
| 132 | if (block_group == sbi->s_groups_count - 1) { | 127 | if (block_group == sbi->s_groups_count - 1) { |
| @@ -295,7 +290,7 @@ err_out: | |||
| 295 | return 0; | 290 | return 0; |
| 296 | } | 291 | } |
| 297 | /** | 292 | /** |
| 298 | * read_block_bitmap() | 293 | * ext4_read_block_bitmap() |
| 299 | * @sb: super block | 294 | * @sb: super block |
| 300 | * @block_group: given block group | 295 | * @block_group: given block group |
| 301 | * | 296 | * |
| @@ -305,7 +300,7 @@ err_out: | |||
| 305 | * Return buffer_head on success or NULL in case of failure. | 300 | * Return buffer_head on success or NULL in case of failure. |
| 306 | */ | 301 | */ |
| 307 | struct buffer_head * | 302 | struct buffer_head * |
| 308 | read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | 303 | ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) |
| 309 | { | 304 | { |
| 310 | struct ext4_group_desc * desc; | 305 | struct ext4_group_desc * desc; |
| 311 | struct buffer_head * bh = NULL; | 306 | struct buffer_head * bh = NULL; |
| @@ -409,8 +404,7 @@ restart: | |||
| 409 | prev = rsv; | 404 | prev = rsv; |
| 410 | } | 405 | } |
| 411 | printk("Window map complete.\n"); | 406 | printk("Window map complete.\n"); |
| 412 | if (bad) | 407 | BUG_ON(bad); |
| 413 | BUG(); | ||
| 414 | } | 408 | } |
| 415 | #define rsv_window_dump(root, verbose) \ | 409 | #define rsv_window_dump(root, verbose) \ |
| 416 | __rsv_window_dump((root), (verbose), __func__) | 410 | __rsv_window_dump((root), (verbose), __func__) |
| @@ -694,7 +688,7 @@ do_more: | |||
| 694 | count -= overflow; | 688 | count -= overflow; |
| 695 | } | 689 | } |
| 696 | brelse(bitmap_bh); | 690 | brelse(bitmap_bh); |
| 697 | bitmap_bh = read_block_bitmap(sb, block_group); | 691 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
| 698 | if (!bitmap_bh) | 692 | if (!bitmap_bh) |
| 699 | goto error_return; | 693 | goto error_return; |
| 700 | desc = ext4_get_group_desc (sb, block_group, &gd_bh); | 694 | desc = ext4_get_group_desc (sb, block_group, &gd_bh); |
| @@ -810,6 +804,13 @@ do_more: | |||
| 810 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 804 | spin_unlock(sb_bgl_lock(sbi, block_group)); |
| 811 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | 805 | percpu_counter_add(&sbi->s_freeblocks_counter, count); |
| 812 | 806 | ||
| 807 | if (sbi->s_log_groups_per_flex) { | ||
| 808 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | ||
| 809 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
| 810 | sbi->s_flex_groups[flex_group].free_blocks += count; | ||
| 811 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
| 812 | } | ||
| 813 | |||
| 813 | /* We dirtied the bitmap block */ | 814 | /* We dirtied the bitmap block */ |
| 814 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | 815 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); |
| 815 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 816 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); |
| @@ -1598,23 +1599,35 @@ out: | |||
| 1598 | 1599 | ||
| 1599 | /** | 1600 | /** |
| 1600 | * ext4_has_free_blocks() | 1601 | * ext4_has_free_blocks() |
| 1601 | * @sbi: in-core super block structure. | 1602 | * @sbi: in-core super block structure. |
| 1603 | * @nblocks: number of neeed blocks | ||
| 1602 | * | 1604 | * |
| 1603 | * Check if filesystem has at least 1 free block available for allocation. | 1605 | * Check if filesystem has free blocks available for allocation. |
| 1606 | * Return the number of blocks avaible for allocation for this request | ||
| 1607 | * On success, return nblocks | ||
| 1604 | */ | 1608 | */ |
| 1605 | static int ext4_has_free_blocks(struct ext4_sb_info *sbi) | 1609 | ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, |
| 1610 | ext4_fsblk_t nblocks) | ||
| 1606 | { | 1611 | { |
| 1607 | ext4_fsblk_t free_blocks, root_blocks; | 1612 | ext4_fsblk_t free_blocks; |
| 1613 | ext4_fsblk_t root_blocks = 0; | ||
| 1608 | 1614 | ||
| 1609 | free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); | 1615 | free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); |
| 1610 | root_blocks = ext4_r_blocks_count(sbi->s_es); | 1616 | |
| 1611 | if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && | 1617 | if (!capable(CAP_SYS_RESOURCE) && |
| 1612 | sbi->s_resuid != current->fsuid && | 1618 | sbi->s_resuid != current->fsuid && |
| 1613 | (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { | 1619 | (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) |
| 1614 | return 0; | 1620 | root_blocks = ext4_r_blocks_count(sbi->s_es); |
| 1615 | } | 1621 | #ifdef CONFIG_SMP |
| 1616 | return 1; | 1622 | if (free_blocks - root_blocks < FBC_BATCH) |
| 1617 | } | 1623 | free_blocks = |
| 1624 | percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); | ||
| 1625 | #endif | ||
| 1626 | if (free_blocks - root_blocks < nblocks) | ||
| 1627 | return free_blocks - root_blocks; | ||
| 1628 | return nblocks; | ||
| 1629 | } | ||
| 1630 | |||
| 1618 | 1631 | ||
| 1619 | /** | 1632 | /** |
| 1620 | * ext4_should_retry_alloc() | 1633 | * ext4_should_retry_alloc() |
| @@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi) | |||
| 1630 | */ | 1643 | */ |
| 1631 | int ext4_should_retry_alloc(struct super_block *sb, int *retries) | 1644 | int ext4_should_retry_alloc(struct super_block *sb, int *retries) |
| 1632 | { | 1645 | { |
| 1633 | if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) | 1646 | if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) |
| 1634 | return 0; | 1647 | return 0; |
| 1635 | 1648 | ||
| 1636 | jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); | 1649 | jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); |
| @@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) | |||
| 1639 | } | 1652 | } |
| 1640 | 1653 | ||
| 1641 | /** | 1654 | /** |
| 1642 | * ext4_new_blocks_old() -- core block(s) allocation function | 1655 | * ext4_old_new_blocks() -- core block bitmap based block allocation function |
| 1656 | * | ||
| 1643 | * @handle: handle to this transaction | 1657 | * @handle: handle to this transaction |
| 1644 | * @inode: file inode | 1658 | * @inode: file inode |
| 1645 | * @goal: given target block(filesystem wide) | 1659 | * @goal: given target block(filesystem wide) |
| 1646 | * @count: target number of blocks to allocate | 1660 | * @count: target number of blocks to allocate |
| 1647 | * @errp: error code | 1661 | * @errp: error code |
| 1648 | * | 1662 | * |
| 1649 | * ext4_new_blocks uses a goal block to assist allocation. It tries to | 1663 | * ext4_old_new_blocks uses a goal block to assist allocation and look up |
| 1650 | * allocate block(s) from the block group contains the goal block first. If that | 1664 | * the block bitmap directly to do block allocation. It tries to |
| 1651 | * fails, it will try to allocate block(s) from other block groups without | 1665 | * allocate block(s) from the block group contains the goal block first. If |
| 1652 | * any specific goal block. | 1666 | * that fails, it will try to allocate block(s) from other block groups |
| 1667 | * without any specific goal block. | ||
| 1668 | * | ||
| 1669 | * This function is called when -o nomballoc mount option is enabled | ||
| 1653 | * | 1670 | * |
| 1654 | */ | 1671 | */ |
| 1655 | ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | 1672 | ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, |
| 1656 | ext4_fsblk_t goal, unsigned long *count, int *errp) | 1673 | ext4_fsblk_t goal, unsigned long *count, int *errp) |
| 1657 | { | 1674 | { |
| 1658 | struct buffer_head *bitmap_bh = NULL; | 1675 | struct buffer_head *bitmap_bh = NULL; |
| @@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | |||
| 1676 | ext4_group_t ngroups; | 1693 | ext4_group_t ngroups; |
| 1677 | unsigned long num = *count; | 1694 | unsigned long num = *count; |
| 1678 | 1695 | ||
| 1679 | *errp = -ENOSPC; | ||
| 1680 | sb = inode->i_sb; | 1696 | sb = inode->i_sb; |
| 1681 | if (!sb) { | 1697 | if (!sb) { |
| 1698 | *errp = -ENODEV; | ||
| 1682 | printk("ext4_new_block: nonexistent device"); | 1699 | printk("ext4_new_block: nonexistent device"); |
| 1683 | return 0; | 1700 | return 0; |
| 1684 | } | 1701 | } |
| 1685 | 1702 | ||
| 1703 | sbi = EXT4_SB(sb); | ||
| 1704 | if (!EXT4_I(inode)->i_delalloc_reserved_flag) { | ||
| 1705 | /* | ||
| 1706 | * With delalloc we already reserved the blocks | ||
| 1707 | */ | ||
| 1708 | *count = ext4_has_free_blocks(sbi, *count); | ||
| 1709 | } | ||
| 1710 | if (*count == 0) { | ||
| 1711 | *errp = -ENOSPC; | ||
| 1712 | return 0; /*return with ENOSPC error */ | ||
| 1713 | } | ||
| 1714 | num = *count; | ||
| 1715 | |||
| 1686 | /* | 1716 | /* |
| 1687 | * Check quota for allocation of this block. | 1717 | * Check quota for allocation of this block. |
| 1688 | */ | 1718 | */ |
| @@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | |||
| 1706 | if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) | 1736 | if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) |
| 1707 | my_rsv = &block_i->rsv_window_node; | 1737 | my_rsv = &block_i->rsv_window_node; |
| 1708 | 1738 | ||
| 1709 | if (!ext4_has_free_blocks(sbi)) { | ||
| 1710 | *errp = -ENOSPC; | ||
| 1711 | goto out; | ||
| 1712 | } | ||
| 1713 | |||
| 1714 | /* | 1739 | /* |
| 1715 | * First, test whether the goal block is free. | 1740 | * First, test whether the goal block is free. |
| 1716 | */ | 1741 | */ |
| @@ -1734,7 +1759,7 @@ retry_alloc: | |||
| 1734 | my_rsv = NULL; | 1759 | my_rsv = NULL; |
| 1735 | 1760 | ||
| 1736 | if (free_blocks > 0) { | 1761 | if (free_blocks > 0) { |
| 1737 | bitmap_bh = read_block_bitmap(sb, group_no); | 1762 | bitmap_bh = ext4_read_block_bitmap(sb, group_no); |
| 1738 | if (!bitmap_bh) | 1763 | if (!bitmap_bh) |
| 1739 | goto io_error; | 1764 | goto io_error; |
| 1740 | grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, | 1765 | grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, |
| @@ -1770,7 +1795,7 @@ retry_alloc: | |||
| 1770 | continue; | 1795 | continue; |
| 1771 | 1796 | ||
| 1772 | brelse(bitmap_bh); | 1797 | brelse(bitmap_bh); |
| 1773 | bitmap_bh = read_block_bitmap(sb, group_no); | 1798 | bitmap_bh = ext4_read_block_bitmap(sb, group_no); |
| 1774 | if (!bitmap_bh) | 1799 | if (!bitmap_bh) |
| 1775 | goto io_error; | 1800 | goto io_error; |
| 1776 | /* | 1801 | /* |
| @@ -1882,7 +1907,15 @@ allocated: | |||
| 1882 | le16_add_cpu(&gdp->bg_free_blocks_count, -num); | 1907 | le16_add_cpu(&gdp->bg_free_blocks_count, -num); |
| 1883 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); | 1908 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); |
| 1884 | spin_unlock(sb_bgl_lock(sbi, group_no)); | 1909 | spin_unlock(sb_bgl_lock(sbi, group_no)); |
| 1885 | percpu_counter_sub(&sbi->s_freeblocks_counter, num); | 1910 | if (!EXT4_I(inode)->i_delalloc_reserved_flag) |
| 1911 | percpu_counter_sub(&sbi->s_freeblocks_counter, num); | ||
| 1912 | |||
| 1913 | if (sbi->s_log_groups_per_flex) { | ||
| 1914 | ext4_group_t flex_group = ext4_flex_group(sbi, group_no); | ||
| 1915 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
| 1916 | sbi->s_flex_groups[flex_group].free_blocks -= num; | ||
| 1917 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
| 1918 | } | ||
| 1886 | 1919 | ||
| 1887 | BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); | 1920 | BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); |
| 1888 | err = ext4_journal_dirty_metadata(handle, gdp_bh); | 1921 | err = ext4_journal_dirty_metadata(handle, gdp_bh); |
| @@ -1915,46 +1948,104 @@ out: | |||
| 1915 | return 0; | 1948 | return 0; |
| 1916 | } | 1949 | } |
| 1917 | 1950 | ||
| 1918 | ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, | 1951 | #define EXT4_META_BLOCK 0x1 |
| 1919 | ext4_fsblk_t goal, int *errp) | 1952 | |
| 1953 | static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, | ||
| 1954 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
| 1955 | unsigned long *count, int *errp, int flags) | ||
| 1920 | { | 1956 | { |
| 1921 | struct ext4_allocation_request ar; | 1957 | struct ext4_allocation_request ar; |
| 1922 | ext4_fsblk_t ret; | 1958 | ext4_fsblk_t ret; |
| 1923 | 1959 | ||
| 1924 | if (!test_opt(inode->i_sb, MBALLOC)) { | 1960 | if (!test_opt(inode->i_sb, MBALLOC)) { |
| 1925 | unsigned long count = 1; | 1961 | return ext4_old_new_blocks(handle, inode, goal, count, errp); |
| 1926 | ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); | ||
| 1927 | return ret; | ||
| 1928 | } | 1962 | } |
| 1929 | 1963 | ||
| 1930 | memset(&ar, 0, sizeof(ar)); | 1964 | memset(&ar, 0, sizeof(ar)); |
| 1965 | /* Fill with neighbour allocated blocks */ | ||
| 1966 | |||
| 1931 | ar.inode = inode; | 1967 | ar.inode = inode; |
| 1932 | ar.goal = goal; | 1968 | ar.goal = goal; |
| 1933 | ar.len = 1; | 1969 | ar.len = *count; |
| 1970 | ar.logical = iblock; | ||
| 1971 | |||
| 1972 | if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) | ||
| 1973 | /* enable in-core preallocation for data block allocation */ | ||
| 1974 | ar.flags = EXT4_MB_HINT_DATA; | ||
| 1975 | else | ||
| 1976 | /* disable in-core preallocation for non-regular files */ | ||
| 1977 | ar.flags = 0; | ||
| 1978 | |||
| 1934 | ret = ext4_mb_new_blocks(handle, &ar, errp); | 1979 | ret = ext4_mb_new_blocks(handle, &ar, errp); |
| 1980 | *count = ar.len; | ||
| 1935 | return ret; | 1981 | return ret; |
| 1936 | } | 1982 | } |
| 1937 | 1983 | ||
| 1938 | ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | 1984 | /* |
| 1985 | * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks | ||
| 1986 | * | ||
| 1987 | * @handle: handle to this transaction | ||
| 1988 | * @inode: file inode | ||
| 1989 | * @goal: given target block(filesystem wide) | ||
| 1990 | * @count: total number of blocks need | ||
| 1991 | * @errp: error code | ||
| 1992 | * | ||
| 1993 | * Return 1st allocated block numberon success, *count stores total account | ||
| 1994 | * error stores in errp pointer | ||
| 1995 | */ | ||
| 1996 | ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | ||
| 1939 | ext4_fsblk_t goal, unsigned long *count, int *errp) | 1997 | ext4_fsblk_t goal, unsigned long *count, int *errp) |
| 1940 | { | 1998 | { |
| 1941 | struct ext4_allocation_request ar; | ||
| 1942 | ext4_fsblk_t ret; | 1999 | ext4_fsblk_t ret; |
| 1943 | 2000 | ret = do_blk_alloc(handle, inode, 0, goal, | |
| 1944 | if (!test_opt(inode->i_sb, MBALLOC)) { | 2001 | count, errp, EXT4_META_BLOCK); |
| 1945 | ret = ext4_new_blocks_old(handle, inode, goal, count, errp); | 2002 | /* |
| 1946 | return ret; | 2003 | * Account for the allocated meta blocks |
| 2004 | */ | ||
| 2005 | if (!(*errp)) { | ||
| 2006 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 2007 | EXT4_I(inode)->i_allocated_meta_blocks += *count; | ||
| 2008 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1947 | } | 2009 | } |
| 1948 | |||
| 1949 | memset(&ar, 0, sizeof(ar)); | ||
| 1950 | ar.inode = inode; | ||
| 1951 | ar.goal = goal; | ||
| 1952 | ar.len = *count; | ||
| 1953 | ret = ext4_mb_new_blocks(handle, &ar, errp); | ||
| 1954 | *count = ar.len; | ||
| 1955 | return ret; | 2010 | return ret; |
| 1956 | } | 2011 | } |
| 1957 | 2012 | ||
| 2013 | /* | ||
| 2014 | * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks | ||
| 2015 | * | ||
| 2016 | * @handle: handle to this transaction | ||
| 2017 | * @inode: file inode | ||
| 2018 | * @goal: given target block(filesystem wide) | ||
| 2019 | * @errp: error code | ||
| 2020 | * | ||
| 2021 | * Return allocated block number on success | ||
| 2022 | */ | ||
| 2023 | ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, | ||
| 2024 | ext4_fsblk_t goal, int *errp) | ||
| 2025 | { | ||
| 2026 | unsigned long count = 1; | ||
| 2027 | return ext4_new_meta_blocks(handle, inode, goal, &count, errp); | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | /* | ||
| 2031 | * ext4_new_blocks() -- allocate data blocks | ||
| 2032 | * | ||
| 2033 | * @handle: handle to this transaction | ||
| 2034 | * @inode: file inode | ||
| 2035 | * @goal: given target block(filesystem wide) | ||
| 2036 | * @count: total number of blocks need | ||
| 2037 | * @errp: error code | ||
| 2038 | * | ||
| 2039 | * Return 1st allocated block numberon success, *count stores total account | ||
| 2040 | * error stores in errp pointer | ||
| 2041 | */ | ||
| 2042 | |||
| 2043 | ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | ||
| 2044 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
| 2045 | unsigned long *count, int *errp) | ||
| 2046 | { | ||
| 2047 | return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); | ||
| 2048 | } | ||
| 1958 | 2049 | ||
| 1959 | /** | 2050 | /** |
| 1960 | * ext4_count_free_blocks() -- count filesystem free blocks | 2051 | * ext4_count_free_blocks() -- count filesystem free blocks |
| @@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
| 1986 | continue; | 2077 | continue; |
| 1987 | desc_count += le16_to_cpu(gdp->bg_free_blocks_count); | 2078 | desc_count += le16_to_cpu(gdp->bg_free_blocks_count); |
| 1988 | brelse(bitmap_bh); | 2079 | brelse(bitmap_bh); |
| 1989 | bitmap_bh = read_block_bitmap(sb, i); | 2080 | bitmap_bh = ext4_read_block_bitmap(sb, i); |
| 1990 | if (bitmap_bh == NULL) | 2081 | if (bitmap_bh == NULL) |
| 1991 | continue; | 2082 | continue; |
| 1992 | 2083 | ||
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2bf0331ea194..d3d23d73c08b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
| @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp, | |||
| 129 | struct buffer_head *bh = NULL; | 129 | struct buffer_head *bh = NULL; |
| 130 | 130 | ||
| 131 | map_bh.b_state = 0; | 131 | map_bh.b_state = 0; |
| 132 | err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); | 132 | err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, |
| 133 | 0, 0, 0); | ||
| 133 | if (err > 0) { | 134 | if (err > 0) { |
| 134 | pgoff_t index = map_bh.b_blocknr >> | 135 | pgoff_t index = map_bh.b_blocknr >> |
| 135 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | 136 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
| @@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root) | |||
| 272 | 273 | ||
| 273 | while (n) { | 274 | while (n) { |
| 274 | /* Do the node's children first */ | 275 | /* Do the node's children first */ |
| 275 | if ((n)->rb_left) { | 276 | if (n->rb_left) { |
| 276 | n = n->rb_left; | 277 | n = n->rb_left; |
| 277 | continue; | 278 | continue; |
| 278 | } | 279 | } |
| @@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root) | |||
| 301 | parent->rb_right = NULL; | 302 | parent->rb_right = NULL; |
| 302 | n = parent; | 303 | n = parent; |
| 303 | } | 304 | } |
| 304 | root->rb_node = NULL; | ||
| 305 | } | 305 | } |
| 306 | 306 | ||
| 307 | 307 | ||
| 308 | static struct dir_private_info *create_dir_info(loff_t pos) | 308 | static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) |
| 309 | { | 309 | { |
| 310 | struct dir_private_info *p; | 310 | struct dir_private_info *p; |
| 311 | 311 | ||
| 312 | p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); | 312 | p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); |
| 313 | if (!p) | 313 | if (!p) |
| 314 | return NULL; | 314 | return NULL; |
| 315 | p->root.rb_node = NULL; | ||
| 316 | p->curr_node = NULL; | ||
| 317 | p->extra_fname = NULL; | ||
| 318 | p->last_pos = 0; | ||
| 319 | p->curr_hash = pos2maj_hash(pos); | 315 | p->curr_hash = pos2maj_hash(pos); |
| 320 | p->curr_minor_hash = pos2min_hash(pos); | 316 | p->curr_minor_hash = pos2min_hash(pos); |
| 321 | p->next_hash = 0; | ||
| 322 | return p; | 317 | return p; |
| 323 | } | 318 | } |
| 324 | 319 | ||
| @@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp, | |||
| 433 | int ret; | 428 | int ret; |
| 434 | 429 | ||
| 435 | if (!info) { | 430 | if (!info) { |
| 436 | info = create_dir_info(filp->f_pos); | 431 | info = ext4_htree_create_dir_info(filp->f_pos); |
| 437 | if (!info) | 432 | if (!info) |
| 438 | return -ENOMEM; | 433 | return -ENOMEM; |
| 439 | filp->private_data = info; | 434 | filp->private_data = info; |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8158083f7ac0..303e41cf7b14 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | #include "ext4_i.h" | 22 | #include "ext4_i.h" |
| 23 | 23 | ||
| 24 | /* | 24 | /* |
| 25 | * The second extended filesystem constants/structures | 25 | * The fourth extended filesystem constants/structures |
| 26 | */ | 26 | */ |
| 27 | 27 | ||
| 28 | /* | 28 | /* |
| @@ -45,7 +45,7 @@ | |||
| 45 | #define ext4_debug(f, a...) \ | 45 | #define ext4_debug(f, a...) \ |
| 46 | do { \ | 46 | do { \ |
| 47 | printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ | 47 | printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ |
| 48 | __FILE__, __LINE__, __FUNCTION__); \ | 48 | __FILE__, __LINE__, __func__); \ |
| 49 | printk (KERN_DEBUG f, ## a); \ | 49 | printk (KERN_DEBUG f, ## a); \ |
| 50 | } while (0) | 50 | } while (0) |
| 51 | #else | 51 | #else |
| @@ -74,6 +74,9 @@ | |||
| 74 | #define EXT4_MB_HINT_GOAL_ONLY 256 | 74 | #define EXT4_MB_HINT_GOAL_ONLY 256 |
| 75 | /* goal is meaningful */ | 75 | /* goal is meaningful */ |
| 76 | #define EXT4_MB_HINT_TRY_GOAL 512 | 76 | #define EXT4_MB_HINT_TRY_GOAL 512 |
| 77 | /* blocks already pre-reserved by delayed allocation */ | ||
| 78 | #define EXT4_MB_DELALLOC_RESERVED 1024 | ||
| 79 | |||
| 77 | 80 | ||
| 78 | struct ext4_allocation_request { | 81 | struct ext4_allocation_request { |
| 79 | /* target inode for block we're allocating */ | 82 | /* target inode for block we're allocating */ |
| @@ -170,6 +173,15 @@ struct ext4_group_desc | |||
| 170 | __u32 bg_reserved2[3]; | 173 | __u32 bg_reserved2[3]; |
| 171 | }; | 174 | }; |
| 172 | 175 | ||
| 176 | /* | ||
| 177 | * Structure of a flex block group info | ||
| 178 | */ | ||
| 179 | |||
| 180 | struct flex_groups { | ||
| 181 | __u32 free_inodes; | ||
| 182 | __u32 free_blocks; | ||
| 183 | }; | ||
| 184 | |||
| 173 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ | 185 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ |
| 174 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ | 186 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ |
| 175 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ | 187 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ |
| @@ -527,6 +539,7 @@ do { \ | |||
| 527 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | 539 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ |
| 528 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | 540 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ |
| 529 | #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ | 541 | #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ |
| 542 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | ||
| 530 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ | 543 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ |
| 531 | #ifndef _LINUX_EXT2_FS_H | 544 | #ifndef _LINUX_EXT2_FS_H |
| 532 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt | 545 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt |
| @@ -647,7 +660,10 @@ struct ext4_super_block { | |||
| 647 | __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ | 660 | __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ |
| 648 | __le64 s_mmp_block; /* Block for multi-mount protection */ | 661 | __le64 s_mmp_block; /* Block for multi-mount protection */ |
| 649 | __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ | 662 | __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ |
| 650 | __u32 s_reserved[163]; /* Padding to the end of the block */ | 663 | __u8 s_log_groups_per_flex; /* FLEX_BG group size */ |
| 664 | __u8 s_reserved_char_pad2; | ||
| 665 | __le16 s_reserved_pad; | ||
| 666 | __u32 s_reserved[162]; /* Padding to the end of the block */ | ||
| 651 | }; | 667 | }; |
| 652 | 668 | ||
| 653 | #ifdef __KERNEL__ | 669 | #ifdef __KERNEL__ |
| @@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, | |||
| 958 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); | 974 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); |
| 959 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, | 975 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, |
| 960 | ext4_group_t group); | 976 | ext4_group_t group); |
| 961 | extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, | 977 | extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, |
| 962 | ext4_fsblk_t goal, int *errp); | 978 | ext4_fsblk_t goal, int *errp); |
| 963 | extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, | 979 | extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, |
| 964 | ext4_fsblk_t goal, unsigned long *count, int *errp); | 980 | ext4_fsblk_t goal, unsigned long *count, int *errp); |
| 965 | extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | 981 | extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, |
| 982 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
| 983 | unsigned long *count, int *errp); | ||
| 984 | extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, | ||
| 966 | ext4_fsblk_t goal, unsigned long *count, int *errp); | 985 | ext4_fsblk_t goal, unsigned long *count, int *errp); |
| 986 | extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, | ||
| 987 | ext4_fsblk_t nblocks); | ||
| 967 | extern void ext4_free_blocks (handle_t *handle, struct inode *inode, | 988 | extern void ext4_free_blocks (handle_t *handle, struct inode *inode, |
| 968 | ext4_fsblk_t block, unsigned long count, int metadata); | 989 | ext4_fsblk_t block, unsigned long count, int metadata); |
| 969 | extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, | 990 | extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, |
| @@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void); | |||
| 1016 | extern void exit_ext4_mballoc(void); | 1037 | extern void exit_ext4_mballoc(void); |
| 1017 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, | 1038 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, |
| 1018 | unsigned long, unsigned long, int, unsigned long *); | 1039 | unsigned long, unsigned long, int, unsigned long *); |
| 1040 | extern int ext4_mb_add_more_groupinfo(struct super_block *sb, | ||
| 1041 | ext4_group_t i, struct ext4_group_desc *desc); | ||
| 1042 | extern void ext4_mb_update_group_info(struct ext4_group_info *grp, | ||
| 1043 | ext4_grpblk_t add); | ||
| 1019 | 1044 | ||
| 1020 | 1045 | ||
| 1021 | /* inode.c */ | 1046 | /* inode.c */ |
| 1047 | void ext4_da_release_space(struct inode *inode, int used, int to_free); | ||
| 1022 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | 1048 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, |
| 1023 | struct buffer_head *bh, ext4_fsblk_t blocknr); | 1049 | struct buffer_head *bh, ext4_fsblk_t blocknr); |
| 1024 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, | 1050 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, |
| @@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 1033 | extern struct inode *ext4_iget(struct super_block *, unsigned long); | 1059 | extern struct inode *ext4_iget(struct super_block *, unsigned long); |
| 1034 | extern int ext4_write_inode (struct inode *, int); | 1060 | extern int ext4_write_inode (struct inode *, int); |
| 1035 | extern int ext4_setattr (struct dentry *, struct iattr *); | 1061 | extern int ext4_setattr (struct dentry *, struct iattr *); |
| 1062 | extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 1063 | struct kstat *stat); | ||
| 1036 | extern void ext4_delete_inode (struct inode *); | 1064 | extern void ext4_delete_inode (struct inode *); |
| 1037 | extern int ext4_sync_inode (handle_t *, struct inode *); | 1065 | extern int ext4_sync_inode (handle_t *, struct inode *); |
| 1038 | extern void ext4_discard_reservation (struct inode *); | 1066 | extern void ext4_discard_reservation (struct inode *); |
| 1039 | extern void ext4_dirty_inode(struct inode *); | 1067 | extern void ext4_dirty_inode(struct inode *); |
| 1040 | extern int ext4_change_inode_journal_flag(struct inode *, int); | 1068 | extern int ext4_change_inode_journal_flag(struct inode *, int); |
| 1041 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | 1069 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); |
| 1070 | extern int ext4_can_truncate(struct inode *inode); | ||
| 1042 | extern void ext4_truncate (struct inode *); | 1071 | extern void ext4_truncate (struct inode *); |
| 1043 | extern void ext4_set_inode_flags(struct inode *); | 1072 | extern void ext4_set_inode_flags(struct inode *); |
| 1044 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | 1073 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
| 1045 | extern void ext4_set_aops(struct inode *inode); | 1074 | extern void ext4_set_aops(struct inode *inode); |
| 1046 | extern int ext4_writepage_trans_blocks(struct inode *); | 1075 | extern int ext4_writepage_trans_blocks(struct inode *); |
| 1047 | extern int ext4_block_truncate_page(handle_t *handle, struct page *page, | 1076 | extern int ext4_block_truncate_page(handle_t *handle, |
| 1048 | struct address_space *mapping, loff_t from); | 1077 | struct address_space *mapping, loff_t from); |
| 1078 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); | ||
| 1049 | 1079 | ||
| 1050 | /* ioctl.c */ | 1080 | /* ioctl.c */ |
| 1051 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); | 1081 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); |
| @@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, | |||
| 1159 | } | 1189 | } |
| 1160 | 1190 | ||
| 1161 | 1191 | ||
| 1192 | static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, | ||
| 1193 | ext4_group_t block_group) | ||
| 1194 | { | ||
| 1195 | return block_group >> sbi->s_log_groups_per_flex; | ||
| 1196 | } | ||
| 1197 | |||
| 1198 | static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) | ||
| 1199 | { | ||
| 1200 | return 1 << sbi->s_log_groups_per_flex; | ||
| 1201 | } | ||
| 1202 | |||
| 1162 | #define ext4_std_error(sb, errno) \ | 1203 | #define ext4_std_error(sb, errno) \ |
| 1163 | do { \ | 1204 | do { \ |
| 1164 | if ((errno)) \ | 1205 | if ((errno)) \ |
| 1165 | __ext4_std_error((sb), __FUNCTION__, (errno)); \ | 1206 | __ext4_std_error((sb), __func__, (errno)); \ |
| 1166 | } while (0) | 1207 | } while (0) |
| 1167 | 1208 | ||
| 1168 | /* | 1209 | /* |
| @@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 1191 | ext4_lblk_t iblock, | 1232 | ext4_lblk_t iblock, |
| 1192 | unsigned long max_blocks, struct buffer_head *bh_result, | 1233 | unsigned long max_blocks, struct buffer_head *bh_result, |
| 1193 | int create, int extend_disksize); | 1234 | int create, int extend_disksize); |
| 1194 | extern void ext4_ext_truncate(struct inode *, struct page *); | 1235 | extern void ext4_ext_truncate(struct inode *); |
| 1195 | extern void ext4_ext_init(struct super_block *); | 1236 | extern void ext4_ext_init(struct super_block *); |
| 1196 | extern void ext4_ext_release(struct super_block *); | 1237 | extern void ext4_ext_release(struct super_block *); |
| 1197 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | 1238 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, |
| @@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | |||
| 1199 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, | 1240 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, |
| 1200 | sector_t block, unsigned long max_blocks, | 1241 | sector_t block, unsigned long max_blocks, |
| 1201 | struct buffer_head *bh, int create, | 1242 | struct buffer_head *bh, int create, |
| 1202 | int extend_disksize); | 1243 | int extend_disksize, int flag); |
| 1203 | #endif /* __KERNEL__ */ | 1244 | #endif /* __KERNEL__ */ |
| 1204 | 1245 | ||
| 1205 | #endif /* _EXT4_H */ | 1246 | #endif /* _EXT4_H */ |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 75333b595fab..6c166c0a54b7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
| @@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) | |||
| 212 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); | 212 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); |
| 213 | } | 213 | } |
| 214 | 214 | ||
| 215 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); | ||
| 215 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); | 216 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); |
| 216 | extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); | 217 | extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); |
| 217 | extern int ext4_extent_tree_init(handle_t *, struct inode *); | 218 | extern int ext4_extent_tree_init(handle_t *, struct inode *); |
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 26a4ae255d79..ef7409f0e7e4 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h | |||
| @@ -79,7 +79,7 @@ struct ext4_ext_cache { | |||
| 79 | }; | 79 | }; |
| 80 | 80 | ||
| 81 | /* | 81 | /* |
| 82 | * third extended file system inode data in memory | 82 | * fourth extended file system inode data in memory |
| 83 | */ | 83 | */ |
| 84 | struct ext4_inode_info { | 84 | struct ext4_inode_info { |
| 85 | __le32 i_data[15]; /* unconverted */ | 85 | __le32 i_data[15]; /* unconverted */ |
| @@ -150,6 +150,7 @@ struct ext4_inode_info { | |||
| 150 | */ | 150 | */ |
| 151 | struct rw_semaphore i_data_sem; | 151 | struct rw_semaphore i_data_sem; |
| 152 | struct inode vfs_inode; | 152 | struct inode vfs_inode; |
| 153 | struct jbd2_inode jinode; | ||
| 153 | 154 | ||
| 154 | unsigned long i_ext_generation; | 155 | unsigned long i_ext_generation; |
| 155 | struct ext4_ext_cache i_cached_extent; | 156 | struct ext4_ext_cache i_cached_extent; |
| @@ -162,6 +163,13 @@ struct ext4_inode_info { | |||
| 162 | /* mballoc */ | 163 | /* mballoc */ |
| 163 | struct list_head i_prealloc_list; | 164 | struct list_head i_prealloc_list; |
| 164 | spinlock_t i_prealloc_lock; | 165 | spinlock_t i_prealloc_lock; |
| 166 | |||
| 167 | /* allocation reservation info for delalloc */ | ||
| 168 | unsigned long i_reserved_data_blocks; | ||
| 169 | unsigned long i_reserved_meta_blocks; | ||
| 170 | unsigned long i_allocated_meta_blocks; | ||
| 171 | unsigned short i_delalloc_reserved_flag; | ||
| 172 | spinlock_t i_block_reservation_lock; | ||
| 165 | }; | 173 | }; |
| 166 | 174 | ||
| 167 | #endif /* _EXT4_I */ | 175 | #endif /* _EXT4_I */ |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 9255a7d28b24..eb8bc3afe6e9 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
| @@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where, | |||
| 142 | handle_t *handle, struct buffer_head *bh); | 142 | handle_t *handle, struct buffer_head *bh); |
| 143 | 143 | ||
| 144 | #define ext4_journal_get_undo_access(handle, bh) \ | 144 | #define ext4_journal_get_undo_access(handle, bh) \ |
| 145 | __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) | 145 | __ext4_journal_get_undo_access(__func__, (handle), (bh)) |
| 146 | #define ext4_journal_get_write_access(handle, bh) \ | 146 | #define ext4_journal_get_write_access(handle, bh) \ |
| 147 | __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) | 147 | __ext4_journal_get_write_access(__func__, (handle), (bh)) |
| 148 | #define ext4_journal_revoke(handle, blocknr, bh) \ | 148 | #define ext4_journal_revoke(handle, blocknr, bh) \ |
| 149 | __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) | 149 | __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) |
| 150 | #define ext4_journal_get_create_access(handle, bh) \ | 150 | #define ext4_journal_get_create_access(handle, bh) \ |
| 151 | __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) | 151 | __ext4_journal_get_create_access(__func__, (handle), (bh)) |
| 152 | #define ext4_journal_dirty_metadata(handle, bh) \ | 152 | #define ext4_journal_dirty_metadata(handle, bh) \ |
| 153 | __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) | 153 | __ext4_journal_dirty_metadata(__func__, (handle), (bh)) |
| 154 | #define ext4_journal_forget(handle, bh) \ | 154 | #define ext4_journal_forget(handle, bh) \ |
| 155 | __ext4_journal_forget(__FUNCTION__, (handle), (bh)) | 155 | __ext4_journal_forget(__func__, (handle), (bh)) |
| 156 | |||
| 157 | int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); | ||
| 158 | 156 | ||
| 159 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); | 157 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); |
| 160 | int __ext4_journal_stop(const char *where, handle_t *handle); | 158 | int __ext4_journal_stop(const char *where, handle_t *handle); |
| @@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) | |||
| 165 | } | 163 | } |
| 166 | 164 | ||
| 167 | #define ext4_journal_stop(handle) \ | 165 | #define ext4_journal_stop(handle) \ |
| 168 | __ext4_journal_stop(__FUNCTION__, (handle)) | 166 | __ext4_journal_stop(__func__, (handle)) |
| 169 | 167 | ||
| 170 | static inline handle_t *ext4_journal_current_handle(void) | 168 | static inline handle_t *ext4_journal_current_handle(void) |
| 171 | { | 169 | { |
| @@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) | |||
| 192 | return jbd2_journal_force_commit(journal); | 190 | return jbd2_journal_force_commit(journal); |
| 193 | } | 191 | } |
| 194 | 192 | ||
| 193 | static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) | ||
| 194 | { | ||
| 195 | return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); | ||
| 196 | } | ||
| 197 | |||
| 195 | /* super.c */ | 198 | /* super.c */ |
| 196 | int ext4_force_commit(struct super_block *sb); | 199 | int ext4_force_commit(struct super_block *sb); |
| 197 | 200 | ||
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 5802e69f2191..6300226d5531 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/rbtree.h> | 25 | #include <linux/rbtree.h> |
| 26 | 26 | ||
| 27 | /* | 27 | /* |
| 28 | * third extended-fs super-block data in memory | 28 | * fourth extended-fs super-block data in memory |
| 29 | */ | 29 | */ |
| 30 | struct ext4_sb_info { | 30 | struct ext4_sb_info { |
| 31 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ | 31 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ |
| @@ -143,6 +143,9 @@ struct ext4_sb_info { | |||
| 143 | 143 | ||
| 144 | /* locality groups */ | 144 | /* locality groups */ |
| 145 | struct ext4_locality_group *s_locality_groups; | 145 | struct ext4_locality_group *s_locality_groups; |
| 146 | |||
| 147 | unsigned int s_log_groups_per_flex; | ||
| 148 | struct flex_groups *s_flex_groups; | ||
| 146 | }; | 149 | }; |
| 147 | 150 | ||
| 148 | #endif /* _EXT4_SB */ | 151 | #endif /* _EXT4_SB */ |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 47929c4e3dae..42c4c0c892ed 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) | |||
| 92 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | 92 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); |
| 93 | } | 93 | } |
| 94 | 94 | ||
| 95 | static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) | 95 | static int ext4_ext_journal_restart(handle_t *handle, int needed) |
| 96 | { | 96 | { |
| 97 | int err; | 97 | int err; |
| 98 | 98 | ||
| 99 | if (handle->h_buffer_credits > needed) | 99 | if (handle->h_buffer_credits > needed) |
| 100 | return handle; | 100 | return 0; |
| 101 | if (!ext4_journal_extend(handle, needed)) | 101 | err = ext4_journal_extend(handle, needed); |
| 102 | return handle; | 102 | if (err) |
| 103 | err = ext4_journal_restart(handle, needed); | 103 | return err; |
| 104 | 104 | return ext4_journal_restart(handle, needed); | |
| 105 | return handle; | ||
| 106 | } | 105 | } |
| 107 | 106 | ||
| 108 | /* | 107 | /* |
| @@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, | |||
| 180 | return bg_start + colour + block; | 179 | return bg_start + colour + block; |
| 181 | } | 180 | } |
| 182 | 181 | ||
| 182 | /* | ||
| 183 | * Allocation for a meta data block | ||
| 184 | */ | ||
| 183 | static ext4_fsblk_t | 185 | static ext4_fsblk_t |
| 184 | ext4_ext_new_block(handle_t *handle, struct inode *inode, | 186 | ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, |
| 185 | struct ext4_ext_path *path, | 187 | struct ext4_ext_path *path, |
| 186 | struct ext4_extent *ex, int *err) | 188 | struct ext4_extent *ex, int *err) |
| 187 | { | 189 | { |
| 188 | ext4_fsblk_t goal, newblock; | 190 | ext4_fsblk_t goal, newblock; |
| 189 | 191 | ||
| 190 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); | 192 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); |
| 191 | newblock = ext4_new_block(handle, inode, goal, err); | 193 | newblock = ext4_new_meta_block(handle, inode, goal, err); |
| 192 | return newblock; | 194 | return newblock; |
| 193 | } | 195 | } |
| 194 | 196 | ||
| @@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) | |||
| 246 | return size; | 248 | return size; |
| 247 | } | 249 | } |
| 248 | 250 | ||
| 251 | /* | ||
| 252 | * Calculate the number of metadata blocks needed | ||
| 253 | * to allocate @blocks | ||
| 254 | * Worse case is one block per extent | ||
| 255 | */ | ||
| 256 | int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) | ||
| 257 | { | ||
| 258 | int lcap, icap, rcap, leafs, idxs, num; | ||
| 259 | int newextents = blocks; | ||
| 260 | |||
| 261 | rcap = ext4_ext_space_root_idx(inode); | ||
| 262 | lcap = ext4_ext_space_block(inode); | ||
| 263 | icap = ext4_ext_space_block_idx(inode); | ||
| 264 | |||
| 265 | /* number of new leaf blocks needed */ | ||
| 266 | num = leafs = (newextents + lcap - 1) / lcap; | ||
| 267 | |||
| 268 | /* | ||
| 269 | * Worse case, we need separate index block(s) | ||
| 270 | * to link all new leaf blocks | ||
| 271 | */ | ||
| 272 | idxs = (leafs + icap - 1) / icap; | ||
| 273 | do { | ||
| 274 | num += idxs; | ||
| 275 | idxs = (idxs + icap - 1) / icap; | ||
| 276 | } while (idxs > rcap); | ||
| 277 | |||
| 278 | return num; | ||
| 279 | } | ||
| 280 | |||
| 249 | static int | 281 | static int |
| 250 | ext4_ext_max_entries(struct inode *inode, int depth) | 282 | ext4_ext_max_entries(struct inode *inode, int depth) |
| 251 | { | 283 | { |
| @@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |||
| 524 | alloc = 1; | 556 | alloc = 1; |
| 525 | } | 557 | } |
| 526 | path[0].p_hdr = eh; | 558 | path[0].p_hdr = eh; |
| 559 | path[0].p_bh = NULL; | ||
| 527 | 560 | ||
| 528 | i = depth; | 561 | i = depth; |
| 529 | /* walk through the tree */ | 562 | /* walk through the tree */ |
| @@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |||
| 552 | } | 585 | } |
| 553 | 586 | ||
| 554 | path[ppos].p_depth = i; | 587 | path[ppos].p_depth = i; |
| 555 | path[ppos].p_hdr = eh; | ||
| 556 | path[ppos].p_ext = NULL; | 588 | path[ppos].p_ext = NULL; |
| 557 | path[ppos].p_idx = NULL; | 589 | path[ppos].p_idx = NULL; |
| 558 | 590 | ||
| 559 | /* find extent */ | 591 | /* find extent */ |
| 560 | ext4_ext_binsearch(inode, path + ppos, block); | 592 | ext4_ext_binsearch(inode, path + ppos, block); |
| 593 | /* if not an empty leaf */ | ||
| 594 | if (path[ppos].p_ext) | ||
| 595 | path[ppos].p_block = ext_pblock(path[ppos].p_ext); | ||
| 561 | 596 | ||
| 562 | ext4_ext_show_path(inode, path); | 597 | ext4_ext_show_path(inode, path); |
| 563 | 598 | ||
| @@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
| 688 | /* allocate all needed blocks */ | 723 | /* allocate all needed blocks */ |
| 689 | ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); | 724 | ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); |
| 690 | for (a = 0; a < depth - at; a++) { | 725 | for (a = 0; a < depth - at; a++) { |
| 691 | newblock = ext4_ext_new_block(handle, inode, path, newext, &err); | 726 | newblock = ext4_ext_new_meta_block(handle, inode, path, |
| 727 | newext, &err); | ||
| 692 | if (newblock == 0) | 728 | if (newblock == 0) |
| 693 | goto cleanup; | 729 | goto cleanup; |
| 694 | ablocks[a] = newblock; | 730 | ablocks[a] = newblock; |
| @@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
| 884 | ext4_fsblk_t newblock; | 920 | ext4_fsblk_t newblock; |
| 885 | int err = 0; | 921 | int err = 0; |
| 886 | 922 | ||
| 887 | newblock = ext4_ext_new_block(handle, inode, path, newext, &err); | 923 | newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); |
| 888 | if (newblock == 0) | 924 | if (newblock == 0) |
| 889 | return err; | 925 | return err; |
| 890 | 926 | ||
| @@ -981,6 +1017,8 @@ repeat: | |||
| 981 | /* if we found index with free entry, then use that | 1017 | /* if we found index with free entry, then use that |
| 982 | * entry: create all needed subtree and add new leaf */ | 1018 | * entry: create all needed subtree and add new leaf */ |
| 983 | err = ext4_ext_split(handle, inode, path, newext, i); | 1019 | err = ext4_ext_split(handle, inode, path, newext, i); |
| 1020 | if (err) | ||
| 1021 | goto out; | ||
| 984 | 1022 | ||
| 985 | /* refill path */ | 1023 | /* refill path */ |
| 986 | ext4_ext_drop_refs(path); | 1024 | ext4_ext_drop_refs(path); |
| @@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 1883 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | 1921 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
| 1884 | #endif | 1922 | #endif |
| 1885 | 1923 | ||
| 1886 | handle = ext4_ext_journal_restart(handle, credits); | 1924 | err = ext4_ext_journal_restart(handle, credits); |
| 1887 | if (IS_ERR(handle)) { | 1925 | if (err) |
| 1888 | err = PTR_ERR(handle); | ||
| 1889 | goto out; | 1926 | goto out; |
| 1890 | } | ||
| 1891 | 1927 | ||
| 1892 | err = ext4_ext_get_access(handle, inode, path + depth); | 1928 | err = ext4_ext_get_access(handle, inode, path + depth); |
| 1893 | if (err) | 1929 | if (err) |
| @@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2529 | int err = 0, depth, ret; | 2565 | int err = 0, depth, ret; |
| 2530 | unsigned long allocated = 0; | 2566 | unsigned long allocated = 0; |
| 2531 | struct ext4_allocation_request ar; | 2567 | struct ext4_allocation_request ar; |
| 2568 | loff_t disksize; | ||
| 2532 | 2569 | ||
| 2533 | __clear_bit(BH_New, &bh_result->b_state); | 2570 | __clear_bit(BH_New, &bh_result->b_state); |
| 2534 | ext_debug("blocks %u/%lu requested for inode %u\n", | 2571 | ext_debug("blocks %u/%lu requested for inode %u\n", |
| @@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2616 | */ | 2653 | */ |
| 2617 | if (allocated > max_blocks) | 2654 | if (allocated > max_blocks) |
| 2618 | allocated = max_blocks; | 2655 | allocated = max_blocks; |
| 2619 | /* mark the buffer unwritten */ | 2656 | set_buffer_unwritten(bh_result); |
| 2620 | __set_bit(BH_Unwritten, &bh_result->b_state); | ||
| 2621 | goto out2; | 2657 | goto out2; |
| 2622 | } | 2658 | } |
| 2623 | 2659 | ||
| @@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
| 2716 | goto out2; | 2752 | goto out2; |
| 2717 | } | 2753 | } |
| 2718 | 2754 | ||
| 2719 | if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) | ||
| 2720 | EXT4_I(inode)->i_disksize = inode->i_size; | ||
| 2721 | |||
| 2722 | /* previous routine could use block we allocated */ | 2755 | /* previous routine could use block we allocated */ |
| 2723 | newblock = ext_pblock(&newex); | 2756 | newblock = ext_pblock(&newex); |
| 2724 | allocated = ext4_ext_get_actual_len(&newex); | 2757 | allocated = ext4_ext_get_actual_len(&newex); |
| 2725 | outnew: | 2758 | outnew: |
| 2726 | __set_bit(BH_New, &bh_result->b_state); | 2759 | if (extend_disksize) { |
| 2760 | disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; | ||
| 2761 | if (disksize > i_size_read(inode)) | ||
| 2762 | disksize = i_size_read(inode); | ||
| 2763 | if (disksize > EXT4_I(inode)->i_disksize) | ||
| 2764 | EXT4_I(inode)->i_disksize = disksize; | ||
| 2765 | } | ||
| 2766 | |||
| 2767 | set_buffer_new(bh_result); | ||
| 2727 | 2768 | ||
| 2728 | /* Cache only when it is _not_ an uninitialized extent */ | 2769 | /* Cache only when it is _not_ an uninitialized extent */ |
| 2729 | if (create != EXT4_CREATE_UNINITIALIZED_EXT) | 2770 | if (create != EXT4_CREATE_UNINITIALIZED_EXT) |
| @@ -2733,7 +2774,7 @@ out: | |||
| 2733 | if (allocated > max_blocks) | 2774 | if (allocated > max_blocks) |
| 2734 | allocated = max_blocks; | 2775 | allocated = max_blocks; |
| 2735 | ext4_ext_show_leaf(inode, path); | 2776 | ext4_ext_show_leaf(inode, path); |
| 2736 | __set_bit(BH_Mapped, &bh_result->b_state); | 2777 | set_buffer_mapped(bh_result); |
| 2737 | bh_result->b_bdev = inode->i_sb->s_bdev; | 2778 | bh_result->b_bdev = inode->i_sb->s_bdev; |
| 2738 | bh_result->b_blocknr = newblock; | 2779 | bh_result->b_blocknr = newblock; |
| 2739 | out2: | 2780 | out2: |
| @@ -2744,7 +2785,7 @@ out2: | |||
| 2744 | return err ? err : allocated; | 2785 | return err ? err : allocated; |
| 2745 | } | 2786 | } |
| 2746 | 2787 | ||
| 2747 | void ext4_ext_truncate(struct inode * inode, struct page *page) | 2788 | void ext4_ext_truncate(struct inode *inode) |
| 2748 | { | 2789 | { |
| 2749 | struct address_space *mapping = inode->i_mapping; | 2790 | struct address_space *mapping = inode->i_mapping; |
| 2750 | struct super_block *sb = inode->i_sb; | 2791 | struct super_block *sb = inode->i_sb; |
| @@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
| 2757 | */ | 2798 | */ |
| 2758 | err = ext4_writepage_trans_blocks(inode) + 3; | 2799 | err = ext4_writepage_trans_blocks(inode) + 3; |
| 2759 | handle = ext4_journal_start(inode, err); | 2800 | handle = ext4_journal_start(inode, err); |
| 2760 | if (IS_ERR(handle)) { | 2801 | if (IS_ERR(handle)) |
| 2761 | if (page) { | ||
| 2762 | clear_highpage(page); | ||
| 2763 | flush_dcache_page(page); | ||
| 2764 | unlock_page(page); | ||
| 2765 | page_cache_release(page); | ||
| 2766 | } | ||
| 2767 | return; | 2802 | return; |
| 2768 | } | ||
| 2769 | 2803 | ||
| 2770 | if (page) | 2804 | if (inode->i_size & (sb->s_blocksize - 1)) |
| 2771 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); | 2805 | ext4_block_truncate_page(handle, mapping, inode->i_size); |
| 2806 | |||
| 2807 | if (ext4_orphan_add(handle, inode)) | ||
| 2808 | goto out_stop; | ||
| 2772 | 2809 | ||
| 2773 | down_write(&EXT4_I(inode)->i_data_sem); | 2810 | down_write(&EXT4_I(inode)->i_data_sem); |
| 2774 | ext4_ext_invalidate_cache(inode); | 2811 | ext4_ext_invalidate_cache(inode); |
| @@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
| 2780 | * Probably we need not scan at all, | 2817 | * Probably we need not scan at all, |
| 2781 | * because page truncation is enough. | 2818 | * because page truncation is enough. |
| 2782 | */ | 2819 | */ |
| 2783 | if (ext4_orphan_add(handle, inode)) | ||
| 2784 | goto out_stop; | ||
| 2785 | 2820 | ||
| 2786 | /* we have to know where to truncate from in crash case */ | 2821 | /* we have to know where to truncate from in crash case */ |
| 2787 | EXT4_I(inode)->i_disksize = inode->i_size; | 2822 | EXT4_I(inode)->i_disksize = inode->i_size; |
| @@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
| 2798 | handle->h_sync = 1; | 2833 | handle->h_sync = 1; |
| 2799 | 2834 | ||
| 2800 | out_stop: | 2835 | out_stop: |
| 2836 | up_write(&EXT4_I(inode)->i_data_sem); | ||
| 2801 | /* | 2837 | /* |
| 2802 | * If this was a simple ftruncate() and the file will remain alive, | 2838 | * If this was a simple ftruncate() and the file will remain alive, |
| 2803 | * then we need to clear up the orphan record which we created above. | 2839 | * then we need to clear up the orphan record which we created above. |
| @@ -2808,7 +2844,6 @@ out_stop: | |||
| 2808 | if (inode->i_nlink) | 2844 | if (inode->i_nlink) |
| 2809 | ext4_orphan_del(handle, inode); | 2845 | ext4_orphan_del(handle, inode); |
| 2810 | 2846 | ||
| 2811 | up_write(&EXT4_I(inode)->i_data_sem); | ||
| 2812 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 2847 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
| 2813 | ext4_mark_inode_dirty(handle, inode); | 2848 | ext4_mark_inode_dirty(handle, inode); |
| 2814 | ext4_journal_stop(handle); | 2849 | ext4_journal_stop(handle); |
| @@ -2911,7 +2946,7 @@ retry: | |||
| 2911 | } | 2946 | } |
| 2912 | ret = ext4_get_blocks_wrap(handle, inode, block, | 2947 | ret = ext4_get_blocks_wrap(handle, inode, block, |
| 2913 | max_blocks, &map_bh, | 2948 | max_blocks, &map_bh, |
| 2914 | EXT4_CREATE_UNINITIALIZED_EXT, 0); | 2949 | EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); |
| 2915 | if (ret <= 0) { | 2950 | if (ret <= 0) { |
| 2916 | #ifdef EXT4FS_DEBUG | 2951 | #ifdef EXT4FS_DEBUG |
| 2917 | WARN_ON(ret <= 0); | 2952 | WARN_ON(ret <= 0); |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4159be6366ab..430eb7978db4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
| @@ -123,6 +123,23 @@ force_commit: | |||
| 123 | return ret; | 123 | return ret; |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | static struct vm_operations_struct ext4_file_vm_ops = { | ||
| 127 | .fault = filemap_fault, | ||
| 128 | .page_mkwrite = ext4_page_mkwrite, | ||
| 129 | }; | ||
| 130 | |||
| 131 | static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 132 | { | ||
| 133 | struct address_space *mapping = file->f_mapping; | ||
| 134 | |||
| 135 | if (!mapping->a_ops->readpage) | ||
| 136 | return -ENOEXEC; | ||
| 137 | file_accessed(file); | ||
| 138 | vma->vm_ops = &ext4_file_vm_ops; | ||
| 139 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
| 140 | return 0; | ||
| 141 | } | ||
| 142 | |||
| 126 | const struct file_operations ext4_file_operations = { | 143 | const struct file_operations ext4_file_operations = { |
| 127 | .llseek = generic_file_llseek, | 144 | .llseek = generic_file_llseek, |
| 128 | .read = do_sync_read, | 145 | .read = do_sync_read, |
| @@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = { | |||
| 133 | #ifdef CONFIG_COMPAT | 150 | #ifdef CONFIG_COMPAT |
| 134 | .compat_ioctl = ext4_compat_ioctl, | 151 | .compat_ioctl = ext4_compat_ioctl, |
| 135 | #endif | 152 | #endif |
| 136 | .mmap = generic_file_mmap, | 153 | .mmap = ext4_file_mmap, |
| 137 | .open = generic_file_open, | 154 | .open = generic_file_open, |
| 138 | .release = ext4_release_file, | 155 | .release = ext4_release_file, |
| 139 | .fsync = ext4_sync_file, | 156 | .fsync = ext4_sync_file, |
| @@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = { | |||
| 144 | const struct inode_operations ext4_file_inode_operations = { | 161 | const struct inode_operations ext4_file_inode_operations = { |
| 145 | .truncate = ext4_truncate, | 162 | .truncate = ext4_truncate, |
| 146 | .setattr = ext4_setattr, | 163 | .setattr = ext4_setattr, |
| 164 | .getattr = ext4_getattr, | ||
| 147 | #ifdef CONFIG_EXT4DEV_FS_XATTR | 165 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
| 148 | .setxattr = generic_setxattr, | 166 | .setxattr = generic_setxattr, |
| 149 | .getxattr = generic_getxattr, | 167 | .getxattr = generic_getxattr, |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 1c8ba48d4f8d..a45c3737ad31 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
| 28 | #include <linux/writeback.h> | 28 | #include <linux/writeback.h> |
| 29 | #include <linux/jbd2.h> | 29 | #include <linux/jbd2.h> |
| 30 | #include <linux/blkdev.h> | ||
| 30 | #include "ext4.h" | 31 | #include "ext4.h" |
| 31 | #include "ext4_jbd2.h" | 32 | #include "ext4_jbd2.h" |
| 32 | 33 | ||
| @@ -45,6 +46,7 @@ | |||
| 45 | int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) | 46 | int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) |
| 46 | { | 47 | { |
| 47 | struct inode *inode = dentry->d_inode; | 48 | struct inode *inode = dentry->d_inode; |
| 49 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | ||
| 48 | int ret = 0; | 50 | int ret = 0; |
| 49 | 51 | ||
| 50 | J_ASSERT(ext4_journal_current_handle() == NULL); | 52 | J_ASSERT(ext4_journal_current_handle() == NULL); |
| @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
| 85 | .nr_to_write = 0, /* sys_fsync did this */ | 87 | .nr_to_write = 0, /* sys_fsync did this */ |
| 86 | }; | 88 | }; |
| 87 | ret = sync_inode(inode, &wbc); | 89 | ret = sync_inode(inode, &wbc); |
| 90 | if (journal && (journal->j_flags & JBD2_BARRIER)) | ||
| 91 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | ||
| 88 | } | 92 | } |
| 89 | out: | 93 | out: |
| 90 | return ret; | 94 | return ret; |
diff --git a/fs/ext4/group.h b/fs/ext4/group.h index 7eb0604e7eea..c2c0a8d06d0e 100644 --- a/fs/ext4/group.h +++ b/fs/ext4/group.h | |||
| @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, | |||
| 13 | struct ext4_group_desc *gdp); | 13 | struct ext4_group_desc *gdp); |
| 14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, | 14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, |
| 15 | struct ext4_group_desc *gdp); | 15 | struct ext4_group_desc *gdp); |
| 16 | struct buffer_head *read_block_bitmap(struct super_block *sb, | 16 | struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, |
| 17 | ext4_group_t block_group); | 17 | ext4_group_t block_group); |
| 18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, | 18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, |
| 19 | struct buffer_head *bh, | 19 | struct buffer_head *bh, |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c6efbab0c801..a92eb305344f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
| @@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |||
| 157 | struct ext4_super_block * es; | 157 | struct ext4_super_block * es; |
| 158 | struct ext4_sb_info *sbi; | 158 | struct ext4_sb_info *sbi; |
| 159 | int fatal = 0, err; | 159 | int fatal = 0, err; |
| 160 | ext4_group_t flex_group; | ||
| 160 | 161 | ||
| 161 | if (atomic_read(&inode->i_count) > 1) { | 162 | if (atomic_read(&inode->i_count) > 1) { |
| 162 | printk ("ext4_free_inode: inode has count=%d\n", | 163 | printk ("ext4_free_inode: inode has count=%d\n", |
| @@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |||
| 232 | if (is_directory) | 233 | if (is_directory) |
| 233 | percpu_counter_dec(&sbi->s_dirs_counter); | 234 | percpu_counter_dec(&sbi->s_dirs_counter); |
| 234 | 235 | ||
| 236 | if (sbi->s_log_groups_per_flex) { | ||
| 237 | flex_group = ext4_flex_group(sbi, block_group); | ||
| 238 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
| 239 | sbi->s_flex_groups[flex_group].free_inodes++; | ||
| 240 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
| 241 | } | ||
| 235 | } | 242 | } |
| 236 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); | 243 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); |
| 237 | err = ext4_journal_dirty_metadata(handle, bh2); | 244 | err = ext4_journal_dirty_metadata(handle, bh2); |
| @@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, | |||
| 286 | return ret; | 293 | return ret; |
| 287 | } | 294 | } |
| 288 | 295 | ||
| 296 | #define free_block_ratio 10 | ||
| 297 | |||
| 298 | static int find_group_flex(struct super_block *sb, struct inode *parent, | ||
| 299 | ext4_group_t *best_group) | ||
| 300 | { | ||
| 301 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 302 | struct ext4_group_desc *desc; | ||
| 303 | struct buffer_head *bh; | ||
| 304 | struct flex_groups *flex_group = sbi->s_flex_groups; | ||
| 305 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | ||
| 306 | ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); | ||
| 307 | ext4_group_t ngroups = sbi->s_groups_count; | ||
| 308 | int flex_size = ext4_flex_bg_size(sbi); | ||
| 309 | ext4_group_t best_flex = parent_fbg_group; | ||
| 310 | int blocks_per_flex = sbi->s_blocks_per_group * flex_size; | ||
| 311 | int flexbg_free_blocks; | ||
| 312 | int flex_freeb_ratio; | ||
| 313 | ext4_group_t n_fbg_groups; | ||
| 314 | ext4_group_t i; | ||
| 315 | |||
| 316 | n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> | ||
| 317 | sbi->s_log_groups_per_flex; | ||
| 318 | |||
| 319 | find_close_to_parent: | ||
| 320 | flexbg_free_blocks = flex_group[best_flex].free_blocks; | ||
| 321 | flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; | ||
| 322 | if (flex_group[best_flex].free_inodes && | ||
| 323 | flex_freeb_ratio > free_block_ratio) | ||
| 324 | goto found_flexbg; | ||
| 325 | |||
| 326 | if (best_flex && best_flex == parent_fbg_group) { | ||
| 327 | best_flex--; | ||
| 328 | goto find_close_to_parent; | ||
| 329 | } | ||
| 330 | |||
| 331 | for (i = 0; i < n_fbg_groups; i++) { | ||
| 332 | if (i == parent_fbg_group || i == parent_fbg_group - 1) | ||
| 333 | continue; | ||
| 334 | |||
| 335 | flexbg_free_blocks = flex_group[i].free_blocks; | ||
| 336 | flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; | ||
| 337 | |||
| 338 | if (flex_freeb_ratio > free_block_ratio && | ||
| 339 | flex_group[i].free_inodes) { | ||
| 340 | best_flex = i; | ||
| 341 | goto found_flexbg; | ||
| 342 | } | ||
| 343 | |||
| 344 | if (best_flex < 0 || | ||
| 345 | (flex_group[i].free_blocks > | ||
| 346 | flex_group[best_flex].free_blocks && | ||
| 347 | flex_group[i].free_inodes)) | ||
| 348 | best_flex = i; | ||
| 349 | } | ||
| 350 | |||
| 351 | if (!flex_group[best_flex].free_inodes || | ||
| 352 | !flex_group[best_flex].free_blocks) | ||
| 353 | return -1; | ||
| 354 | |||
| 355 | found_flexbg: | ||
| 356 | for (i = best_flex * flex_size; i < ngroups && | ||
| 357 | i < (best_flex + 1) * flex_size; i++) { | ||
| 358 | desc = ext4_get_group_desc(sb, i, &bh); | ||
| 359 | if (le16_to_cpu(desc->bg_free_inodes_count)) { | ||
| 360 | *best_group = i; | ||
| 361 | goto out; | ||
| 362 | } | ||
| 363 | } | ||
| 364 | |||
| 365 | return -1; | ||
| 366 | out: | ||
| 367 | return 0; | ||
| 368 | } | ||
| 369 | |||
| 289 | /* | 370 | /* |
| 290 | * Orlov's allocator for directories. | 371 | * Orlov's allocator for directories. |
| 291 | * | 372 | * |
| @@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
| 501 | struct inode *ret; | 582 | struct inode *ret; |
| 502 | ext4_group_t i; | 583 | ext4_group_t i; |
| 503 | int free = 0; | 584 | int free = 0; |
| 585 | ext4_group_t flex_group; | ||
| 504 | 586 | ||
| 505 | /* Cannot create files in a deleted directory */ | 587 | /* Cannot create files in a deleted directory */ |
| 506 | if (!dir || !dir->i_nlink) | 588 | if (!dir || !dir->i_nlink) |
| @@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
| 514 | 596 | ||
| 515 | sbi = EXT4_SB(sb); | 597 | sbi = EXT4_SB(sb); |
| 516 | es = sbi->s_es; | 598 | es = sbi->s_es; |
| 599 | |||
| 600 | if (sbi->s_log_groups_per_flex) { | ||
| 601 | ret2 = find_group_flex(sb, dir, &group); | ||
| 602 | goto got_group; | ||
| 603 | } | ||
| 604 | |||
| 517 | if (S_ISDIR(mode)) { | 605 | if (S_ISDIR(mode)) { |
| 518 | if (test_opt (sb, OLDALLOC)) | 606 | if (test_opt (sb, OLDALLOC)) |
| 519 | ret2 = find_group_dir(sb, dir, &group); | 607 | ret2 = find_group_dir(sb, dir, &group); |
| @@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
| 522 | } else | 610 | } else |
| 523 | ret2 = find_group_other(sb, dir, &group); | 611 | ret2 = find_group_other(sb, dir, &group); |
| 524 | 612 | ||
| 613 | got_group: | ||
| 525 | err = -ENOSPC; | 614 | err = -ENOSPC; |
| 526 | if (ret2 == -1) | 615 | if (ret2 == -1) |
| 527 | goto out; | 616 | goto out; |
| @@ -600,7 +689,7 @@ got: | |||
| 600 | /* We may have to initialize the block bitmap if it isn't already */ | 689 | /* We may have to initialize the block bitmap if it isn't already */ |
| 601 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && | 690 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && |
| 602 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 691 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 603 | struct buffer_head *block_bh = read_block_bitmap(sb, group); | 692 | struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); |
| 604 | 693 | ||
| 605 | BUFFER_TRACE(block_bh, "get block bitmap access"); | 694 | BUFFER_TRACE(block_bh, "get block bitmap access"); |
| 606 | err = ext4_journal_get_write_access(handle, block_bh); | 695 | err = ext4_journal_get_write_access(handle, block_bh); |
| @@ -676,6 +765,13 @@ got: | |||
| 676 | percpu_counter_inc(&sbi->s_dirs_counter); | 765 | percpu_counter_inc(&sbi->s_dirs_counter); |
| 677 | sb->s_dirt = 1; | 766 | sb->s_dirt = 1; |
| 678 | 767 | ||
| 768 | if (sbi->s_log_groups_per_flex) { | ||
| 769 | flex_group = ext4_flex_group(sbi, group); | ||
| 770 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
| 771 | sbi->s_flex_groups[flex_group].free_inodes--; | ||
| 772 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
| 773 | } | ||
| 774 | |||
| 679 | inode->i_uid = current->fsuid; | 775 | inode->i_uid = current->fsuid; |
| 680 | if (test_opt (sb, GRPID)) | 776 | if (test_opt (sb, GRPID)) |
| 681 | inode->i_gid = dir->i_gid; | 777 | inode->i_gid = dir->i_gid; |
| @@ -740,14 +836,10 @@ got: | |||
| 740 | goto fail_free_drop; | 836 | goto fail_free_drop; |
| 741 | 837 | ||
| 742 | if (test_opt(sb, EXTENTS)) { | 838 | if (test_opt(sb, EXTENTS)) { |
| 743 | /* set extent flag only for diretory, file and normal symlink*/ | 839 | /* set extent flag only for directory, file and normal symlink*/ |
| 744 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { | 840 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { |
| 745 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; | 841 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; |
| 746 | ext4_ext_tree_init(handle, inode); | 842 | ext4_ext_tree_init(handle, inode); |
| 747 | err = ext4_update_incompat_feature(handle, sb, | ||
| 748 | EXT4_FEATURE_INCOMPAT_EXTENTS); | ||
| 749 | if (err) | ||
| 750 | goto fail_free_drop; | ||
| 751 | } | 843 | } |
| 752 | } | 844 | } |
| 753 | 845 | ||
| @@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) | |||
| 817 | if (IS_ERR(inode)) | 909 | if (IS_ERR(inode)) |
| 818 | goto iget_failed; | 910 | goto iget_failed; |
| 819 | 911 | ||
| 912 | /* | ||
| 913 | * If the orphans has i_nlinks > 0 then it should be able to be | ||
| 914 | * truncated, otherwise it won't be removed from the orphan list | ||
| 915 | * during processing and an infinite loop will result. | ||
| 916 | */ | ||
| 917 | if (inode->i_nlink && !ext4_can_truncate(inode)) | ||
| 918 | goto bad_orphan; | ||
| 919 | |||
| 820 | if (NEXT_ORPHAN(inode) > max_ino) | 920 | if (NEXT_ORPHAN(inode) > max_ino) |
| 821 | goto bad_orphan; | 921 | goto bad_orphan; |
| 822 | brelse(bitmap_bh); | 922 | brelse(bitmap_bh); |
| @@ -838,6 +938,7 @@ bad_orphan: | |||
| 838 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", | 938 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", |
| 839 | NEXT_ORPHAN(inode)); | 939 | NEXT_ORPHAN(inode)); |
| 840 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); | 940 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); |
| 941 | printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); | ||
| 841 | /* Avoid freeing blocks if we got a bad deleted inode */ | 942 | /* Avoid freeing blocks if we got a bad deleted inode */ |
| 842 | if (inode->i_nlink == 0) | 943 | if (inode->i_nlink == 0) |
| 843 | inode->i_blocks = 0; | 944 | inode->i_blocks = 0; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d9707746413..8ca2763df091 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -32,12 +32,23 @@ | |||
| 32 | #include <linux/string.h> | 32 | #include <linux/string.h> |
| 33 | #include <linux/buffer_head.h> | 33 | #include <linux/buffer_head.h> |
| 34 | #include <linux/writeback.h> | 34 | #include <linux/writeback.h> |
| 35 | #include <linux/pagevec.h> | ||
| 35 | #include <linux/mpage.h> | 36 | #include <linux/mpage.h> |
| 36 | #include <linux/uio.h> | 37 | #include <linux/uio.h> |
| 37 | #include <linux/bio.h> | 38 | #include <linux/bio.h> |
| 38 | #include "ext4_jbd2.h" | 39 | #include "ext4_jbd2.h" |
| 39 | #include "xattr.h" | 40 | #include "xattr.h" |
| 40 | #include "acl.h" | 41 | #include "acl.h" |
| 42 | #include "ext4_extents.h" | ||
| 43 | |||
| 44 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | ||
| 45 | loff_t new_size) | ||
| 46 | { | ||
| 47 | return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, | ||
| 48 | new_size); | ||
| 49 | } | ||
| 50 | |||
| 51 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | ||
| 41 | 52 | ||
| 42 | /* | 53 | /* |
| 43 | * Test whether an inode is a fast symlink. | 54 | * Test whether an inode is a fast symlink. |
| @@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode) | |||
| 181 | { | 192 | { |
| 182 | handle_t *handle; | 193 | handle_t *handle; |
| 183 | 194 | ||
| 195 | if (ext4_should_order_data(inode)) | ||
| 196 | ext4_begin_ordered_truncate(inode, 0); | ||
| 184 | truncate_inode_pages(&inode->i_data, 0); | 197 | truncate_inode_pages(&inode->i_data, 0); |
| 185 | 198 | ||
| 186 | if (is_bad_inode(inode)) | 199 | if (is_bad_inode(inode)) |
| @@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, | |||
| 508 | * direct blocks | 521 | * direct blocks |
| 509 | */ | 522 | */ |
| 510 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | 523 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, |
| 511 | ext4_fsblk_t goal, int indirect_blks, int blks, | 524 | ext4_lblk_t iblock, ext4_fsblk_t goal, |
| 512 | ext4_fsblk_t new_blocks[4], int *err) | 525 | int indirect_blks, int blks, |
| 526 | ext4_fsblk_t new_blocks[4], int *err) | ||
| 513 | { | 527 | { |
| 514 | int target, i; | 528 | int target, i; |
| 515 | unsigned long count = 0; | 529 | unsigned long count = 0, blk_allocated = 0; |
| 516 | int index = 0; | 530 | int index = 0; |
| 517 | ext4_fsblk_t current_block = 0; | 531 | ext4_fsblk_t current_block = 0; |
| 518 | int ret = 0; | 532 | int ret = 0; |
| @@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
| 525 | * the first direct block of this branch. That's the | 539 | * the first direct block of this branch. That's the |
| 526 | * minimum number of blocks need to allocate(required) | 540 | * minimum number of blocks need to allocate(required) |
| 527 | */ | 541 | */ |
| 528 | target = blks + indirect_blks; | 542 | /* first we try to allocate the indirect blocks */ |
| 529 | 543 | target = indirect_blks; | |
| 530 | while (1) { | 544 | while (target > 0) { |
| 531 | count = target; | 545 | count = target; |
| 532 | /* allocating blocks for indirect blocks and direct blocks */ | 546 | /* allocating blocks for indirect blocks and direct blocks */ |
| 533 | current_block = ext4_new_blocks(handle,inode,goal,&count,err); | 547 | current_block = ext4_new_meta_blocks(handle, inode, |
| 548 | goal, &count, err); | ||
| 534 | if (*err) | 549 | if (*err) |
| 535 | goto failed_out; | 550 | goto failed_out; |
| 536 | 551 | ||
| @@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
| 540 | new_blocks[index++] = current_block++; | 555 | new_blocks[index++] = current_block++; |
| 541 | count--; | 556 | count--; |
| 542 | } | 557 | } |
| 543 | 558 | if (count > 0) { | |
| 544 | if (count > 0) | 559 | /* |
| 560 | * save the new block number | ||
| 561 | * for the first direct block | ||
| 562 | */ | ||
| 563 | new_blocks[index] = current_block; | ||
| 564 | printk(KERN_INFO "%s returned more blocks than " | ||
| 565 | "requested\n", __func__); | ||
| 566 | WARN_ON(1); | ||
| 545 | break; | 567 | break; |
| 568 | } | ||
| 546 | } | 569 | } |
| 547 | 570 | ||
| 548 | /* save the new block number for the first direct block */ | 571 | target = blks - count ; |
| 549 | new_blocks[index] = current_block; | 572 | blk_allocated = count; |
| 550 | 573 | if (!target) | |
| 574 | goto allocated; | ||
| 575 | /* Now allocate data blocks */ | ||
| 576 | count = target; | ||
| 577 | /* allocating blocks for data blocks */ | ||
| 578 | current_block = ext4_new_blocks(handle, inode, iblock, | ||
| 579 | goal, &count, err); | ||
| 580 | if (*err && (target == blks)) { | ||
| 581 | /* | ||
| 582 | * if the allocation failed and we didn't allocate | ||
| 583 | * any blocks before | ||
| 584 | */ | ||
| 585 | goto failed_out; | ||
| 586 | } | ||
| 587 | if (!*err) { | ||
| 588 | if (target == blks) { | ||
| 589 | /* | ||
| 590 | * save the new block number | ||
| 591 | * for the first direct block | ||
| 592 | */ | ||
| 593 | new_blocks[index] = current_block; | ||
| 594 | } | ||
| 595 | blk_allocated += count; | ||
| 596 | } | ||
| 597 | allocated: | ||
| 551 | /* total number of blocks allocated for direct blocks */ | 598 | /* total number of blocks allocated for direct blocks */ |
| 552 | ret = count; | 599 | ret = blk_allocated; |
| 553 | *err = 0; | 600 | *err = 0; |
| 554 | return ret; | 601 | return ret; |
| 555 | failed_out: | 602 | failed_out: |
| @@ -584,8 +631,9 @@ failed_out: | |||
| 584 | * as described above and return 0. | 631 | * as described above and return 0. |
| 585 | */ | 632 | */ |
| 586 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | 633 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, |
| 587 | int indirect_blks, int *blks, ext4_fsblk_t goal, | 634 | ext4_lblk_t iblock, int indirect_blks, |
| 588 | ext4_lblk_t *offsets, Indirect *branch) | 635 | int *blks, ext4_fsblk_t goal, |
| 636 | ext4_lblk_t *offsets, Indirect *branch) | ||
| 589 | { | 637 | { |
| 590 | int blocksize = inode->i_sb->s_blocksize; | 638 | int blocksize = inode->i_sb->s_blocksize; |
| 591 | int i, n = 0; | 639 | int i, n = 0; |
| @@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
| 595 | ext4_fsblk_t new_blocks[4]; | 643 | ext4_fsblk_t new_blocks[4]; |
| 596 | ext4_fsblk_t current_block; | 644 | ext4_fsblk_t current_block; |
| 597 | 645 | ||
| 598 | num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, | 646 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, |
| 599 | *blks, new_blocks, &err); | 647 | *blks, new_blocks, &err); |
| 600 | if (err) | 648 | if (err) |
| 601 | return err; | 649 | return err; |
| @@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 799 | struct ext4_inode_info *ei = EXT4_I(inode); | 847 | struct ext4_inode_info *ei = EXT4_I(inode); |
| 800 | int count = 0; | 848 | int count = 0; |
| 801 | ext4_fsblk_t first_block = 0; | 849 | ext4_fsblk_t first_block = 0; |
| 850 | loff_t disksize; | ||
| 802 | 851 | ||
| 803 | 852 | ||
| 804 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | 853 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); |
| @@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 855 | /* | 904 | /* |
| 856 | * Block out ext4_truncate while we alter the tree | 905 | * Block out ext4_truncate while we alter the tree |
| 857 | */ | 906 | */ |
| 858 | err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, | 907 | err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, |
| 859 | offsets + (partial - chain), partial); | 908 | &count, goal, |
| 909 | offsets + (partial - chain), partial); | ||
| 860 | 910 | ||
| 861 | /* | 911 | /* |
| 862 | * The ext4_splice_branch call will free and forget any buffers | 912 | * The ext4_splice_branch call will free and forget any buffers |
| @@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
| 873 | * protect it if you're about to implement concurrent | 923 | * protect it if you're about to implement concurrent |
| 874 | * ext4_get_block() -bzzz | 924 | * ext4_get_block() -bzzz |
| 875 | */ | 925 | */ |
| 876 | if (!err && extend_disksize && inode->i_size > ei->i_disksize) | 926 | if (!err && extend_disksize) { |
| 877 | ei->i_disksize = inode->i_size; | 927 | disksize = ((loff_t) iblock + count) << inode->i_blkbits; |
| 928 | if (disksize > i_size_read(inode)) | ||
| 929 | disksize = i_size_read(inode); | ||
| 930 | if (disksize > ei->i_disksize) | ||
| 931 | ei->i_disksize = disksize; | ||
| 932 | } | ||
| 878 | if (err) | 933 | if (err) |
| 879 | goto cleanup; | 934 | goto cleanup; |
| 880 | 935 | ||
| @@ -934,7 +989,7 @@ out: | |||
| 934 | */ | 989 | */ |
| 935 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | 990 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, |
| 936 | unsigned long max_blocks, struct buffer_head *bh, | 991 | unsigned long max_blocks, struct buffer_head *bh, |
| 937 | int create, int extend_disksize) | 992 | int create, int extend_disksize, int flag) |
| 938 | { | 993 | { |
| 939 | int retval; | 994 | int retval; |
| 940 | 995 | ||
| @@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 975 | * with create == 1 flag. | 1030 | * with create == 1 flag. |
| 976 | */ | 1031 | */ |
| 977 | down_write((&EXT4_I(inode)->i_data_sem)); | 1032 | down_write((&EXT4_I(inode)->i_data_sem)); |
| 1033 | |||
| 1034 | /* | ||
| 1035 | * if the caller is from delayed allocation writeout path | ||
| 1036 | * we have already reserved fs blocks for allocation | ||
| 1037 | * let the underlying get_block() function know to | ||
| 1038 | * avoid double accounting | ||
| 1039 | */ | ||
| 1040 | if (flag) | ||
| 1041 | EXT4_I(inode)->i_delalloc_reserved_flag = 1; | ||
| 978 | /* | 1042 | /* |
| 979 | * We need to check for EXT4 here because migrate | 1043 | * We need to check for EXT4 here because migrate |
| 980 | * could have changed the inode type in between | 1044 | * could have changed the inode type in between |
| @@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 996 | ~EXT4_EXT_MIGRATE; | 1060 | ~EXT4_EXT_MIGRATE; |
| 997 | } | 1061 | } |
| 998 | } | 1062 | } |
| 1063 | |||
| 1064 | if (flag) { | ||
| 1065 | EXT4_I(inode)->i_delalloc_reserved_flag = 0; | ||
| 1066 | /* | ||
| 1067 | * Update reserved blocks/metadata blocks | ||
| 1068 | * after successful block allocation | ||
| 1069 | * which were deferred till now | ||
| 1070 | */ | ||
| 1071 | if ((retval > 0) && buffer_delay(bh)) | ||
| 1072 | ext4_da_release_space(inode, retval, 0); | ||
| 1073 | } | ||
| 1074 | |||
| 999 | up_write((&EXT4_I(inode)->i_data_sem)); | 1075 | up_write((&EXT4_I(inode)->i_data_sem)); |
| 1000 | return retval; | 1076 | return retval; |
| 1001 | } | 1077 | } |
| @@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, | |||
| 1021 | } | 1097 | } |
| 1022 | 1098 | ||
| 1023 | ret = ext4_get_blocks_wrap(handle, inode, iblock, | 1099 | ret = ext4_get_blocks_wrap(handle, inode, iblock, |
| 1024 | max_blocks, bh_result, create, 0); | 1100 | max_blocks, bh_result, create, 0, 0); |
| 1025 | if (ret > 0) { | 1101 | if (ret > 0) { |
| 1026 | bh_result->b_size = (ret << inode->i_blkbits); | 1102 | bh_result->b_size = (ret << inode->i_blkbits); |
| 1027 | ret = 0; | 1103 | ret = 0; |
| @@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |||
| 1047 | dummy.b_blocknr = -1000; | 1123 | dummy.b_blocknr = -1000; |
| 1048 | buffer_trace_init(&dummy.b_history); | 1124 | buffer_trace_init(&dummy.b_history); |
| 1049 | err = ext4_get_blocks_wrap(handle, inode, block, 1, | 1125 | err = ext4_get_blocks_wrap(handle, inode, block, 1, |
| 1050 | &dummy, create, 1); | 1126 | &dummy, create, 1, 0); |
| 1051 | /* | 1127 | /* |
| 1052 | * ext4_get_blocks_handle() returns number of blocks | 1128 | * ext4_get_blocks_handle() returns number of blocks |
| 1053 | * mapped. 0 in case of a HOLE. | 1129 | * mapped. 0 in case of a HOLE. |
| @@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, | |||
| 1203 | to = from + len; | 1279 | to = from + len; |
| 1204 | 1280 | ||
| 1205 | retry: | 1281 | retry: |
| 1206 | page = __grab_cache_page(mapping, index); | ||
| 1207 | if (!page) | ||
| 1208 | return -ENOMEM; | ||
| 1209 | *pagep = page; | ||
| 1210 | |||
| 1211 | handle = ext4_journal_start(inode, needed_blocks); | 1282 | handle = ext4_journal_start(inode, needed_blocks); |
| 1212 | if (IS_ERR(handle)) { | 1283 | if (IS_ERR(handle)) { |
| 1213 | unlock_page(page); | ||
| 1214 | page_cache_release(page); | ||
| 1215 | ret = PTR_ERR(handle); | 1284 | ret = PTR_ERR(handle); |
| 1216 | goto out; | 1285 | goto out; |
| 1217 | } | 1286 | } |
| 1218 | 1287 | ||
| 1288 | page = __grab_cache_page(mapping, index); | ||
| 1289 | if (!page) { | ||
| 1290 | ext4_journal_stop(handle); | ||
| 1291 | ret = -ENOMEM; | ||
| 1292 | goto out; | ||
| 1293 | } | ||
| 1294 | *pagep = page; | ||
| 1295 | |||
| 1219 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 1296 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
| 1220 | ext4_get_block); | 1297 | ext4_get_block); |
| 1221 | 1298 | ||
| @@ -1225,8 +1302,8 @@ retry: | |||
| 1225 | } | 1302 | } |
| 1226 | 1303 | ||
| 1227 | if (ret) { | 1304 | if (ret) { |
| 1228 | ext4_journal_stop(handle); | ||
| 1229 | unlock_page(page); | 1305 | unlock_page(page); |
| 1306 | ext4_journal_stop(handle); | ||
| 1230 | page_cache_release(page); | 1307 | page_cache_release(page); |
| 1231 | } | 1308 | } |
| 1232 | 1309 | ||
| @@ -1236,15 +1313,6 @@ out: | |||
| 1236 | return ret; | 1313 | return ret; |
| 1237 | } | 1314 | } |
| 1238 | 1315 | ||
| 1239 | int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
| 1240 | { | ||
| 1241 | int err = jbd2_journal_dirty_data(handle, bh); | ||
| 1242 | if (err) | ||
| 1243 | ext4_journal_abort_handle(__func__, __func__, | ||
| 1244 | bh, handle, err); | ||
| 1245 | return err; | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | /* For write_end() in data=journal mode */ | 1316 | /* For write_end() in data=journal mode */ |
| 1249 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | 1317 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) |
| 1250 | { | 1318 | { |
| @@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |||
| 1255 | } | 1323 | } |
| 1256 | 1324 | ||
| 1257 | /* | 1325 | /* |
| 1258 | * Generic write_end handler for ordered and writeback ext4 journal modes. | ||
| 1259 | * We can't use generic_write_end, because that unlocks the page and we need to | ||
| 1260 | * unlock the page after ext4_journal_stop, but ext4_journal_stop must run | ||
| 1261 | * after block_write_end. | ||
| 1262 | */ | ||
| 1263 | static int ext4_generic_write_end(struct file *file, | ||
| 1264 | struct address_space *mapping, | ||
| 1265 | loff_t pos, unsigned len, unsigned copied, | ||
| 1266 | struct page *page, void *fsdata) | ||
| 1267 | { | ||
| 1268 | struct inode *inode = file->f_mapping->host; | ||
| 1269 | |||
| 1270 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | ||
| 1271 | |||
| 1272 | if (pos+copied > inode->i_size) { | ||
| 1273 | i_size_write(inode, pos+copied); | ||
| 1274 | mark_inode_dirty(inode); | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | return copied; | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | /* | ||
| 1281 | * We need to pick up the new inode size which generic_commit_write gave us | 1326 | * We need to pick up the new inode size which generic_commit_write gave us |
| 1282 | * `file' can be NULL - eg, when called from page_symlink(). | 1327 | * `file' can be NULL - eg, when called from page_symlink(). |
| 1283 | * | 1328 | * |
| @@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file, | |||
| 1290 | struct page *page, void *fsdata) | 1335 | struct page *page, void *fsdata) |
| 1291 | { | 1336 | { |
| 1292 | handle_t *handle = ext4_journal_current_handle(); | 1337 | handle_t *handle = ext4_journal_current_handle(); |
| 1293 | struct inode *inode = file->f_mapping->host; | 1338 | struct inode *inode = mapping->host; |
| 1294 | unsigned from, to; | 1339 | unsigned from, to; |
| 1295 | int ret = 0, ret2; | 1340 | int ret = 0, ret2; |
| 1296 | 1341 | ||
| 1297 | from = pos & (PAGE_CACHE_SIZE - 1); | 1342 | from = pos & (PAGE_CACHE_SIZE - 1); |
| 1298 | to = from + len; | 1343 | to = from + len; |
| 1299 | 1344 | ||
| 1300 | ret = walk_page_buffers(handle, page_buffers(page), | 1345 | ret = ext4_jbd2_file_inode(handle, inode); |
| 1301 | from, to, NULL, ext4_journal_dirty_data); | ||
| 1302 | 1346 | ||
| 1303 | if (ret == 0) { | 1347 | if (ret == 0) { |
| 1304 | /* | 1348 | /* |
| @@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file, | |||
| 1311 | new_i_size = pos + copied; | 1355 | new_i_size = pos + copied; |
| 1312 | if (new_i_size > EXT4_I(inode)->i_disksize) | 1356 | if (new_i_size > EXT4_I(inode)->i_disksize) |
| 1313 | EXT4_I(inode)->i_disksize = new_i_size; | 1357 | EXT4_I(inode)->i_disksize = new_i_size; |
| 1314 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | 1358 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
| 1315 | page, fsdata); | 1359 | page, fsdata); |
| 1316 | copied = ret2; | 1360 | copied = ret2; |
| 1317 | if (ret2 < 0) | 1361 | if (ret2 < 0) |
| @@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file, | |||
| 1320 | ret2 = ext4_journal_stop(handle); | 1364 | ret2 = ext4_journal_stop(handle); |
| 1321 | if (!ret) | 1365 | if (!ret) |
| 1322 | ret = ret2; | 1366 | ret = ret2; |
| 1323 | unlock_page(page); | ||
| 1324 | page_cache_release(page); | ||
| 1325 | 1367 | ||
| 1326 | return ret ? ret : copied; | 1368 | return ret ? ret : copied; |
| 1327 | } | 1369 | } |
| @@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file, | |||
| 1332 | struct page *page, void *fsdata) | 1374 | struct page *page, void *fsdata) |
| 1333 | { | 1375 | { |
| 1334 | handle_t *handle = ext4_journal_current_handle(); | 1376 | handle_t *handle = ext4_journal_current_handle(); |
| 1335 | struct inode *inode = file->f_mapping->host; | 1377 | struct inode *inode = mapping->host; |
| 1336 | int ret = 0, ret2; | 1378 | int ret = 0, ret2; |
| 1337 | loff_t new_i_size; | 1379 | loff_t new_i_size; |
| 1338 | 1380 | ||
| @@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file, | |||
| 1340 | if (new_i_size > EXT4_I(inode)->i_disksize) | 1382 | if (new_i_size > EXT4_I(inode)->i_disksize) |
| 1341 | EXT4_I(inode)->i_disksize = new_i_size; | 1383 | EXT4_I(inode)->i_disksize = new_i_size; |
| 1342 | 1384 | ||
| 1343 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | 1385 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
| 1344 | page, fsdata); | 1386 | page, fsdata); |
| 1345 | copied = ret2; | 1387 | copied = ret2; |
| 1346 | if (ret2 < 0) | 1388 | if (ret2 < 0) |
| @@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file, | |||
| 1349 | ret2 = ext4_journal_stop(handle); | 1391 | ret2 = ext4_journal_stop(handle); |
| 1350 | if (!ret) | 1392 | if (!ret) |
| 1351 | ret = ret2; | 1393 | ret = ret2; |
| 1352 | unlock_page(page); | ||
| 1353 | page_cache_release(page); | ||
| 1354 | 1394 | ||
| 1355 | return ret ? ret : copied; | 1395 | return ret ? ret : copied; |
| 1356 | } | 1396 | } |
| @@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file, | |||
| 1389 | ret = ret2; | 1429 | ret = ret2; |
| 1390 | } | 1430 | } |
| 1391 | 1431 | ||
| 1432 | unlock_page(page); | ||
| 1392 | ret2 = ext4_journal_stop(handle); | 1433 | ret2 = ext4_journal_stop(handle); |
| 1393 | if (!ret) | 1434 | if (!ret) |
| 1394 | ret = ret2; | 1435 | ret = ret2; |
| 1395 | unlock_page(page); | ||
| 1396 | page_cache_release(page); | 1436 | page_cache_release(page); |
| 1397 | 1437 | ||
| 1398 | return ret ? ret : copied; | 1438 | return ret ? ret : copied; |
| 1399 | } | 1439 | } |
| 1440 | /* | ||
| 1441 | * Calculate the number of metadata blocks need to reserve | ||
| 1442 | * to allocate @blocks for non extent file based file | ||
| 1443 | */ | ||
| 1444 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) | ||
| 1445 | { | ||
| 1446 | int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
| 1447 | int ind_blks, dind_blks, tind_blks; | ||
| 1448 | |||
| 1449 | /* number of new indirect blocks needed */ | ||
| 1450 | ind_blks = (blocks + icap - 1) / icap; | ||
| 1451 | |||
| 1452 | dind_blks = (ind_blks + icap - 1) / icap; | ||
| 1453 | |||
| 1454 | tind_blks = 1; | ||
| 1455 | |||
| 1456 | return ind_blks + dind_blks + tind_blks; | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* | ||
| 1460 | * Calculate the number of metadata blocks need to reserve | ||
| 1461 | * to allocate given number of blocks | ||
| 1462 | */ | ||
| 1463 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | ||
| 1464 | { | ||
| 1465 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
| 1466 | return ext4_ext_calc_metadata_amount(inode, blocks); | ||
| 1467 | |||
| 1468 | return ext4_indirect_calc_metadata_amount(inode, blocks); | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | ||
| 1472 | { | ||
| 1473 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1474 | unsigned long md_needed, mdblocks, total = 0; | ||
| 1475 | |||
| 1476 | /* | ||
| 1477 | * recalculate the amount of metadata blocks to reserve | ||
| 1478 | * in order to allocate nrblocks | ||
| 1479 | * worse case is one extent per block | ||
| 1480 | */ | ||
| 1481 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1482 | total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; | ||
| 1483 | mdblocks = ext4_calc_metadata_amount(inode, total); | ||
| 1484 | BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); | ||
| 1485 | |||
| 1486 | md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; | ||
| 1487 | total = md_needed + nrblocks; | ||
| 1488 | |||
| 1489 | if (ext4_has_free_blocks(sbi, total) < total) { | ||
| 1490 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1491 | return -ENOSPC; | ||
| 1492 | } | ||
| 1493 | |||
| 1494 | /* reduce fs free blocks counter */ | ||
| 1495 | percpu_counter_sub(&sbi->s_freeblocks_counter, total); | ||
| 1496 | |||
| 1497 | EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | ||
| 1498 | EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; | ||
| 1499 | |||
| 1500 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1501 | return 0; /* success */ | ||
| 1502 | } | ||
| 1503 | |||
| 1504 | void ext4_da_release_space(struct inode *inode, int used, int to_free) | ||
| 1505 | { | ||
| 1506 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1507 | int total, mdb, mdb_free, release; | ||
| 1508 | |||
| 1509 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1510 | /* recalculate the number of metablocks still need to be reserved */ | ||
| 1511 | total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; | ||
| 1512 | mdb = ext4_calc_metadata_amount(inode, total); | ||
| 1513 | |||
| 1514 | /* figure out how many metablocks to release */ | ||
| 1515 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
| 1516 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | ||
| 1517 | |||
| 1518 | /* Account for allocated meta_blocks */ | ||
| 1519 | mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; | ||
| 1520 | |||
| 1521 | release = to_free + mdb_free; | ||
| 1522 | |||
| 1523 | /* update fs free blocks counter for truncate case */ | ||
| 1524 | percpu_counter_add(&sbi->s_freeblocks_counter, release); | ||
| 1525 | |||
| 1526 | /* update per-inode reservations */ | ||
| 1527 | BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); | ||
| 1528 | EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); | ||
| 1529 | |||
| 1530 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
| 1531 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | ||
| 1532 | EXT4_I(inode)->i_allocated_meta_blocks = 0; | ||
| 1533 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1534 | } | ||
| 1535 | |||
| 1536 | static void ext4_da_page_release_reservation(struct page *page, | ||
| 1537 | unsigned long offset) | ||
| 1538 | { | ||
| 1539 | int to_release = 0; | ||
| 1540 | struct buffer_head *head, *bh; | ||
| 1541 | unsigned int curr_off = 0; | ||
| 1542 | |||
| 1543 | head = page_buffers(page); | ||
| 1544 | bh = head; | ||
| 1545 | do { | ||
| 1546 | unsigned int next_off = curr_off + bh->b_size; | ||
| 1547 | |||
| 1548 | if ((offset <= curr_off) && (buffer_delay(bh))) { | ||
| 1549 | to_release++; | ||
| 1550 | clear_buffer_delay(bh); | ||
| 1551 | } | ||
| 1552 | curr_off = next_off; | ||
| 1553 | } while ((bh = bh->b_this_page) != head); | ||
| 1554 | ext4_da_release_space(page->mapping->host, 0, to_release); | ||
| 1555 | } | ||
| 1556 | |||
| 1557 | /* | ||
| 1558 | * Delayed allocation stuff | ||
| 1559 | */ | ||
| 1560 | |||
| 1561 | struct mpage_da_data { | ||
| 1562 | struct inode *inode; | ||
| 1563 | struct buffer_head lbh; /* extent of blocks */ | ||
| 1564 | unsigned long first_page, next_page; /* extent of pages */ | ||
| 1565 | get_block_t *get_block; | ||
| 1566 | struct writeback_control *wbc; | ||
| 1567 | }; | ||
| 1568 | |||
| 1569 | /* | ||
| 1570 | * mpage_da_submit_io - walks through extent of pages and try to write | ||
| 1571 | * them with __mpage_writepage() | ||
| 1572 | * | ||
| 1573 | * @mpd->inode: inode | ||
| 1574 | * @mpd->first_page: first page of the extent | ||
| 1575 | * @mpd->next_page: page after the last page of the extent | ||
| 1576 | * @mpd->get_block: the filesystem's block mapper function | ||
| 1577 | * | ||
| 1578 | * By the time mpage_da_submit_io() is called we expect all blocks | ||
| 1579 | * to be allocated. this may be wrong if allocation failed. | ||
| 1580 | * | ||
| 1581 | * As pages are already locked by write_cache_pages(), we can't use it | ||
| 1582 | */ | ||
| 1583 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | ||
| 1584 | { | ||
| 1585 | struct address_space *mapping = mpd->inode->i_mapping; | ||
| 1586 | struct mpage_data mpd_pp = { | ||
| 1587 | .bio = NULL, | ||
| 1588 | .last_block_in_bio = 0, | ||
| 1589 | .get_block = mpd->get_block, | ||
| 1590 | .use_writepage = 1, | ||
| 1591 | }; | ||
| 1592 | int ret = 0, err, nr_pages, i; | ||
| 1593 | unsigned long index, end; | ||
| 1594 | struct pagevec pvec; | ||
| 1595 | |||
| 1596 | BUG_ON(mpd->next_page <= mpd->first_page); | ||
| 1597 | |||
| 1598 | pagevec_init(&pvec, 0); | ||
| 1599 | index = mpd->first_page; | ||
| 1600 | end = mpd->next_page - 1; | ||
| 1601 | |||
| 1602 | while (index <= end) { | ||
| 1603 | /* XXX: optimize tail */ | ||
| 1604 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
| 1605 | if (nr_pages == 0) | ||
| 1606 | break; | ||
| 1607 | for (i = 0; i < nr_pages; i++) { | ||
| 1608 | struct page *page = pvec.pages[i]; | ||
| 1609 | |||
| 1610 | index = page->index; | ||
| 1611 | if (index > end) | ||
| 1612 | break; | ||
| 1613 | index++; | ||
| 1614 | |||
| 1615 | err = __mpage_writepage(page, mpd->wbc, &mpd_pp); | ||
| 1616 | |||
| 1617 | /* | ||
| 1618 | * In error case, we have to continue because | ||
| 1619 | * remaining pages are still locked | ||
| 1620 | * XXX: unlock and re-dirty them? | ||
| 1621 | */ | ||
| 1622 | if (ret == 0) | ||
| 1623 | ret = err; | ||
| 1624 | } | ||
| 1625 | pagevec_release(&pvec); | ||
| 1626 | } | ||
| 1627 | if (mpd_pp.bio) | ||
| 1628 | mpage_bio_submit(WRITE, mpd_pp.bio); | ||
| 1629 | |||
| 1630 | return ret; | ||
| 1631 | } | ||
| 1632 | |||
| 1633 | /* | ||
| 1634 | * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers | ||
| 1635 | * | ||
| 1636 | * @mpd->inode - inode to walk through | ||
| 1637 | * @exbh->b_blocknr - first block on a disk | ||
| 1638 | * @exbh->b_size - amount of space in bytes | ||
| 1639 | * @logical - first logical block to start assignment with | ||
| 1640 | * | ||
| 1641 | * the function goes through all passed space and put actual disk | ||
| 1642 | * block numbers into buffer heads, dropping BH_Delay | ||
| 1643 | */ | ||
| 1644 | static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | ||
| 1645 | struct buffer_head *exbh) | ||
| 1646 | { | ||
| 1647 | struct inode *inode = mpd->inode; | ||
| 1648 | struct address_space *mapping = inode->i_mapping; | ||
| 1649 | int blocks = exbh->b_size >> inode->i_blkbits; | ||
| 1650 | sector_t pblock = exbh->b_blocknr, cur_logical; | ||
| 1651 | struct buffer_head *head, *bh; | ||
| 1652 | unsigned long index, end; | ||
| 1653 | struct pagevec pvec; | ||
| 1654 | int nr_pages, i; | ||
| 1655 | |||
| 1656 | index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
| 1657 | end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
| 1658 | cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
| 1659 | |||
| 1660 | pagevec_init(&pvec, 0); | ||
| 1661 | |||
| 1662 | while (index <= end) { | ||
| 1663 | /* XXX: optimize tail */ | ||
| 1664 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
| 1665 | if (nr_pages == 0) | ||
| 1666 | break; | ||
| 1667 | for (i = 0; i < nr_pages; i++) { | ||
| 1668 | struct page *page = pvec.pages[i]; | ||
| 1669 | |||
| 1670 | index = page->index; | ||
| 1671 | if (index > end) | ||
| 1672 | break; | ||
| 1673 | index++; | ||
| 1674 | |||
| 1675 | BUG_ON(!PageLocked(page)); | ||
| 1676 | BUG_ON(PageWriteback(page)); | ||
| 1677 | BUG_ON(!page_has_buffers(page)); | ||
| 1678 | |||
| 1679 | bh = page_buffers(page); | ||
| 1680 | head = bh; | ||
| 1681 | |||
| 1682 | /* skip blocks out of the range */ | ||
| 1683 | do { | ||
| 1684 | if (cur_logical >= logical) | ||
| 1685 | break; | ||
| 1686 | cur_logical++; | ||
| 1687 | } while ((bh = bh->b_this_page) != head); | ||
| 1688 | |||
| 1689 | do { | ||
| 1690 | if (cur_logical >= logical + blocks) | ||
| 1691 | break; | ||
| 1692 | if (buffer_delay(bh)) { | ||
| 1693 | bh->b_blocknr = pblock; | ||
| 1694 | clear_buffer_delay(bh); | ||
| 1695 | } else if (buffer_mapped(bh)) | ||
| 1696 | BUG_ON(bh->b_blocknr != pblock); | ||
| 1697 | |||
| 1698 | cur_logical++; | ||
| 1699 | pblock++; | ||
| 1700 | } while ((bh = bh->b_this_page) != head); | ||
| 1701 | } | ||
| 1702 | pagevec_release(&pvec); | ||
| 1703 | } | ||
| 1704 | } | ||
| 1705 | |||
| 1706 | |||
| 1707 | /* | ||
| 1708 | * __unmap_underlying_blocks - just a helper function to unmap | ||
| 1709 | * set of blocks described by @bh | ||
| 1710 | */ | ||
| 1711 | static inline void __unmap_underlying_blocks(struct inode *inode, | ||
| 1712 | struct buffer_head *bh) | ||
| 1713 | { | ||
| 1714 | struct block_device *bdev = inode->i_sb->s_bdev; | ||
| 1715 | int blocks, i; | ||
| 1716 | |||
| 1717 | blocks = bh->b_size >> inode->i_blkbits; | ||
| 1718 | for (i = 0; i < blocks; i++) | ||
| 1719 | unmap_underlying_metadata(bdev, bh->b_blocknr + i); | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | /* | ||
| 1723 | * mpage_da_map_blocks - go through given space | ||
| 1724 | * | ||
| 1725 | * @mpd->lbh - bh describing space | ||
| 1726 | * @mpd->get_block - the filesystem's block mapper function | ||
| 1727 | * | ||
| 1728 | * The function skips space we know is already mapped to disk blocks. | ||
| 1729 | * | ||
| 1730 | * The function ignores errors ->get_block() returns, thus real | ||
| 1731 | * error handling is postponed to __mpage_writepage() | ||
| 1732 | */ | ||
| 1733 | static void mpage_da_map_blocks(struct mpage_da_data *mpd) | ||
| 1734 | { | ||
| 1735 | struct buffer_head *lbh = &mpd->lbh; | ||
| 1736 | int err = 0, remain = lbh->b_size; | ||
| 1737 | sector_t next = lbh->b_blocknr; | ||
| 1738 | struct buffer_head new; | ||
| 1739 | |||
| 1740 | /* | ||
| 1741 | * We consider only non-mapped and non-allocated blocks | ||
| 1742 | */ | ||
| 1743 | if (buffer_mapped(lbh) && !buffer_delay(lbh)) | ||
| 1744 | return; | ||
| 1745 | |||
| 1746 | while (remain) { | ||
| 1747 | new.b_state = lbh->b_state; | ||
| 1748 | new.b_blocknr = 0; | ||
| 1749 | new.b_size = remain; | ||
| 1750 | err = mpd->get_block(mpd->inode, next, &new, 1); | ||
| 1751 | if (err) { | ||
| 1752 | /* | ||
| 1753 | * Rather than implement own error handling | ||
| 1754 | * here, we just leave remaining blocks | ||
| 1755 | * unallocated and try again with ->writepage() | ||
| 1756 | */ | ||
| 1757 | break; | ||
| 1758 | } | ||
| 1759 | BUG_ON(new.b_size == 0); | ||
| 1760 | |||
| 1761 | if (buffer_new(&new)) | ||
| 1762 | __unmap_underlying_blocks(mpd->inode, &new); | ||
| 1763 | |||
| 1764 | /* | ||
| 1765 | * If blocks are delayed marked, we need to | ||
| 1766 | * put actual blocknr and drop delayed bit | ||
| 1767 | */ | ||
| 1768 | if (buffer_delay(lbh)) | ||
| 1769 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
| 1770 | |||
| 1771 | /* go for the remaining blocks */ | ||
| 1772 | next += new.b_size >> mpd->inode->i_blkbits; | ||
| 1773 | remain -= new.b_size; | ||
| 1774 | } | ||
| 1775 | } | ||
| 1776 | |||
| 1777 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) | ||
| 1778 | |||
| 1779 | /* | ||
| 1780 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | ||
| 1781 | * | ||
| 1782 | * @mpd->lbh - extent of blocks | ||
| 1783 | * @logical - logical number of the block in the file | ||
| 1784 | * @bh - bh of the block (used to access block's state) | ||
| 1785 | * | ||
| 1786 | * the function is used to collect contig. blocks in same state | ||
| 1787 | */ | ||
| 1788 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | ||
| 1789 | sector_t logical, struct buffer_head *bh) | ||
| 1790 | { | ||
| 1791 | struct buffer_head *lbh = &mpd->lbh; | ||
| 1792 | sector_t next; | ||
| 1793 | |||
| 1794 | next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); | ||
| 1795 | |||
| 1796 | /* | ||
| 1797 | * First block in the extent | ||
| 1798 | */ | ||
| 1799 | if (lbh->b_size == 0) { | ||
| 1800 | lbh->b_blocknr = logical; | ||
| 1801 | lbh->b_size = bh->b_size; | ||
| 1802 | lbh->b_state = bh->b_state & BH_FLAGS; | ||
| 1803 | return; | ||
| 1804 | } | ||
| 1805 | |||
| 1806 | /* | ||
| 1807 | * Can we merge the block to our big extent? | ||
| 1808 | */ | ||
| 1809 | if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { | ||
| 1810 | lbh->b_size += bh->b_size; | ||
| 1811 | return; | ||
| 1812 | } | ||
| 1813 | |||
| 1814 | /* | ||
| 1815 | * We couldn't merge the block to our extent, so we | ||
| 1816 | * need to flush current extent and start new one | ||
| 1817 | */ | ||
| 1818 | mpage_da_map_blocks(mpd); | ||
| 1819 | |||
| 1820 | /* | ||
| 1821 | * Now start a new extent | ||
| 1822 | */ | ||
| 1823 | lbh->b_size = bh->b_size; | ||
| 1824 | lbh->b_state = bh->b_state & BH_FLAGS; | ||
| 1825 | lbh->b_blocknr = logical; | ||
| 1826 | } | ||
| 1827 | |||
| 1828 | /* | ||
| 1829 | * __mpage_da_writepage - finds extent of pages and blocks | ||
| 1830 | * | ||
| 1831 | * @page: page to consider | ||
| 1832 | * @wbc: not used, we just follow rules | ||
| 1833 | * @data: context | ||
| 1834 | * | ||
| 1835 | * The function finds extents of pages and scan them for all blocks. | ||
| 1836 | */ | ||
| 1837 | static int __mpage_da_writepage(struct page *page, | ||
| 1838 | struct writeback_control *wbc, void *data) | ||
| 1839 | { | ||
| 1840 | struct mpage_da_data *mpd = data; | ||
| 1841 | struct inode *inode = mpd->inode; | ||
| 1842 | struct buffer_head *bh, *head, fake; | ||
| 1843 | sector_t logical; | ||
| 1844 | |||
| 1845 | /* | ||
| 1846 | * Can we merge this page to current extent? | ||
| 1847 | */ | ||
| 1848 | if (mpd->next_page != page->index) { | ||
| 1849 | /* | ||
| 1850 | * Nope, we can't. So, we map non-allocated blocks | ||
| 1851 | * and start IO on them using __mpage_writepage() | ||
| 1852 | */ | ||
| 1853 | if (mpd->next_page != mpd->first_page) { | ||
| 1854 | mpage_da_map_blocks(mpd); | ||
| 1855 | mpage_da_submit_io(mpd); | ||
| 1856 | } | ||
| 1857 | |||
| 1858 | /* | ||
| 1859 | * Start next extent of pages ... | ||
| 1860 | */ | ||
| 1861 | mpd->first_page = page->index; | ||
| 1862 | |||
| 1863 | /* | ||
| 1864 | * ... and blocks | ||
| 1865 | */ | ||
| 1866 | mpd->lbh.b_size = 0; | ||
| 1867 | mpd->lbh.b_state = 0; | ||
| 1868 | mpd->lbh.b_blocknr = 0; | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | mpd->next_page = page->index + 1; | ||
| 1872 | logical = (sector_t) page->index << | ||
| 1873 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
| 1874 | |||
| 1875 | if (!page_has_buffers(page)) { | ||
| 1876 | /* | ||
| 1877 | * There is no attached buffer heads yet (mmap?) | ||
| 1878 | * we treat the page asfull of dirty blocks | ||
| 1879 | */ | ||
| 1880 | bh = &fake; | ||
| 1881 | bh->b_size = PAGE_CACHE_SIZE; | ||
| 1882 | bh->b_state = 0; | ||
| 1883 | set_buffer_dirty(bh); | ||
| 1884 | set_buffer_uptodate(bh); | ||
| 1885 | mpage_add_bh_to_extent(mpd, logical, bh); | ||
| 1886 | } else { | ||
| 1887 | /* | ||
| 1888 | * Page with regular buffer heads, just add all dirty ones | ||
| 1889 | */ | ||
| 1890 | head = page_buffers(page); | ||
| 1891 | bh = head; | ||
| 1892 | do { | ||
| 1893 | BUG_ON(buffer_locked(bh)); | ||
| 1894 | if (buffer_dirty(bh)) | ||
| 1895 | mpage_add_bh_to_extent(mpd, logical, bh); | ||
| 1896 | logical++; | ||
| 1897 | } while ((bh = bh->b_this_page) != head); | ||
| 1898 | } | ||
| 1899 | |||
| 1900 | return 0; | ||
| 1901 | } | ||
| 1902 | |||
| 1903 | /* | ||
| 1904 | * mpage_da_writepages - walk the list of dirty pages of the given | ||
| 1905 | * address space, allocates non-allocated blocks, maps newly-allocated | ||
| 1906 | * blocks to existing bhs and issue IO them | ||
| 1907 | * | ||
| 1908 | * @mapping: address space structure to write | ||
| 1909 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | ||
| 1910 | * @get_block: the filesystem's block mapper function. | ||
| 1911 | * | ||
| 1912 | * This is a library function, which implements the writepages() | ||
| 1913 | * address_space_operation. | ||
| 1914 | * | ||
| 1915 | * In order to avoid duplication of logic that deals with partial pages, | ||
| 1916 | * multiple bio per page, etc, we find non-allocated blocks, allocate | ||
| 1917 | * them with minimal calls to ->get_block() and re-use __mpage_writepage() | ||
| 1918 | * | ||
| 1919 | * It's important that we call __mpage_writepage() only once for each | ||
| 1920 | * involved page, otherwise we'd have to implement more complicated logic | ||
| 1921 | * to deal with pages w/o PG_lock or w/ PG_writeback and so on. | ||
| 1922 | * | ||
| 1923 | * See comments to mpage_writepages() | ||
| 1924 | */ | ||
| 1925 | static int mpage_da_writepages(struct address_space *mapping, | ||
| 1926 | struct writeback_control *wbc, | ||
| 1927 | get_block_t get_block) | ||
| 1928 | { | ||
| 1929 | struct mpage_da_data mpd; | ||
| 1930 | int ret; | ||
| 1931 | |||
| 1932 | if (!get_block) | ||
| 1933 | return generic_writepages(mapping, wbc); | ||
| 1934 | |||
| 1935 | mpd.wbc = wbc; | ||
| 1936 | mpd.inode = mapping->host; | ||
| 1937 | mpd.lbh.b_size = 0; | ||
| 1938 | mpd.lbh.b_state = 0; | ||
| 1939 | mpd.lbh.b_blocknr = 0; | ||
| 1940 | mpd.first_page = 0; | ||
| 1941 | mpd.next_page = 0; | ||
| 1942 | mpd.get_block = get_block; | ||
| 1943 | |||
| 1944 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); | ||
| 1945 | |||
| 1946 | /* | ||
| 1947 | * Handle last extent of pages | ||
| 1948 | */ | ||
| 1949 | if (mpd.next_page != mpd.first_page) { | ||
| 1950 | mpage_da_map_blocks(&mpd); | ||
| 1951 | mpage_da_submit_io(&mpd); | ||
| 1952 | } | ||
| 1953 | |||
| 1954 | return ret; | ||
| 1955 | } | ||
| 1956 | |||
| 1957 | /* | ||
| 1958 | * this is a special callback for ->write_begin() only | ||
| 1959 | * it's intention is to return mapped block or reserve space | ||
| 1960 | */ | ||
| 1961 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | ||
| 1962 | struct buffer_head *bh_result, int create) | ||
| 1963 | { | ||
| 1964 | int ret = 0; | ||
| 1965 | |||
| 1966 | BUG_ON(create == 0); | ||
| 1967 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | ||
| 1968 | |||
| 1969 | /* | ||
| 1970 | * first, we need to know whether the block is allocated already | ||
| 1971 | * preallocated blocks are unmapped but should treated | ||
| 1972 | * the same as allocated blocks. | ||
| 1973 | */ | ||
| 1974 | ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); | ||
| 1975 | if ((ret == 0) && !buffer_delay(bh_result)) { | ||
| 1976 | /* the block isn't (pre)allocated yet, let's reserve space */ | ||
| 1977 | /* | ||
| 1978 | * XXX: __block_prepare_write() unmaps passed block, | ||
| 1979 | * is it OK? | ||
| 1980 | */ | ||
| 1981 | ret = ext4_da_reserve_space(inode, 1); | ||
| 1982 | if (ret) | ||
| 1983 | /* not enough space to reserve */ | ||
| 1984 | return ret; | ||
| 1985 | |||
| 1986 | map_bh(bh_result, inode->i_sb, 0); | ||
| 1987 | set_buffer_new(bh_result); | ||
| 1988 | set_buffer_delay(bh_result); | ||
| 1989 | } else if (ret > 0) { | ||
| 1990 | bh_result->b_size = (ret << inode->i_blkbits); | ||
| 1991 | ret = 0; | ||
| 1992 | } | ||
| 1993 | |||
| 1994 | return ret; | ||
| 1995 | } | ||
| 1996 | #define EXT4_DELALLOC_RSVED 1 | ||
| 1997 | static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | ||
| 1998 | struct buffer_head *bh_result, int create) | ||
| 1999 | { | ||
| 2000 | int ret; | ||
| 2001 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
| 2002 | loff_t disksize = EXT4_I(inode)->i_disksize; | ||
| 2003 | handle_t *handle = NULL; | ||
| 2004 | |||
| 2005 | handle = ext4_journal_current_handle(); | ||
| 2006 | if (!handle) { | ||
| 2007 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | ||
| 2008 | bh_result, 0, 0, 0); | ||
| 2009 | BUG_ON(!ret); | ||
| 2010 | } else { | ||
| 2011 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | ||
| 2012 | bh_result, create, 0, EXT4_DELALLOC_RSVED); | ||
| 2013 | } | ||
| 2014 | |||
| 2015 | if (ret > 0) { | ||
| 2016 | bh_result->b_size = (ret << inode->i_blkbits); | ||
| 2017 | |||
| 2018 | /* | ||
| 2019 | * Update on-disk size along with block allocation | ||
| 2020 | * we don't use 'extend_disksize' as size may change | ||
| 2021 | * within already allocated block -bzzz | ||
| 2022 | */ | ||
| 2023 | disksize = ((loff_t) iblock + ret) << inode->i_blkbits; | ||
| 2024 | if (disksize > i_size_read(inode)) | ||
| 2025 | disksize = i_size_read(inode); | ||
| 2026 | if (disksize > EXT4_I(inode)->i_disksize) { | ||
| 2027 | /* | ||
| 2028 | * XXX: replace with spinlock if seen contended -bzzz | ||
| 2029 | */ | ||
| 2030 | down_write(&EXT4_I(inode)->i_data_sem); | ||
| 2031 | if (disksize > EXT4_I(inode)->i_disksize) | ||
| 2032 | EXT4_I(inode)->i_disksize = disksize; | ||
| 2033 | up_write(&EXT4_I(inode)->i_data_sem); | ||
| 2034 | |||
| 2035 | if (EXT4_I(inode)->i_disksize == disksize) { | ||
| 2036 | ret = ext4_mark_inode_dirty(handle, inode); | ||
| 2037 | return ret; | ||
| 2038 | } | ||
| 2039 | } | ||
| 2040 | ret = 0; | ||
| 2041 | } | ||
| 2042 | return ret; | ||
| 2043 | } | ||
| 2044 | |||
| 2045 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | ||
| 2046 | { | ||
| 2047 | /* | ||
| 2048 | * unmapped buffer is possible for holes. | ||
| 2049 | * delay buffer is possible with delayed allocation | ||
| 2050 | */ | ||
| 2051 | return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); | ||
| 2052 | } | ||
| 2053 | |||
| 2054 | static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, | ||
| 2055 | struct buffer_head *bh_result, int create) | ||
| 2056 | { | ||
| 2057 | int ret = 0; | ||
| 2058 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
| 2059 | |||
| 2060 | /* | ||
| 2061 | * we don't want to do block allocation in writepage | ||
| 2062 | * so call get_block_wrap with create = 0 | ||
| 2063 | */ | ||
| 2064 | ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, | ||
| 2065 | bh_result, 0, 0, 0); | ||
| 2066 | if (ret > 0) { | ||
| 2067 | bh_result->b_size = (ret << inode->i_blkbits); | ||
| 2068 | ret = 0; | ||
| 2069 | } | ||
| 2070 | return ret; | ||
| 2071 | } | ||
| 2072 | |||
| 2073 | /* | ||
| 2074 | * get called vi ext4_da_writepages after taking page lock (have journal handle) | ||
| 2075 | * get called via journal_submit_inode_data_buffers (no journal handle) | ||
| 2076 | * get called via shrink_page_list via pdflush (no journal handle) | ||
| 2077 | * or grab_page_cache when doing write_begin (have journal handle) | ||
| 2078 | */ | ||
| 2079 | static int ext4_da_writepage(struct page *page, | ||
| 2080 | struct writeback_control *wbc) | ||
| 2081 | { | ||
| 2082 | int ret = 0; | ||
| 2083 | loff_t size; | ||
| 2084 | unsigned long len; | ||
| 2085 | struct buffer_head *page_bufs; | ||
| 2086 | struct inode *inode = page->mapping->host; | ||
| 2087 | |||
| 2088 | size = i_size_read(inode); | ||
| 2089 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
| 2090 | len = size & ~PAGE_CACHE_MASK; | ||
| 2091 | else | ||
| 2092 | len = PAGE_CACHE_SIZE; | ||
| 2093 | |||
| 2094 | if (page_has_buffers(page)) { | ||
| 2095 | page_bufs = page_buffers(page); | ||
| 2096 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | ||
| 2097 | ext4_bh_unmapped_or_delay)) { | ||
| 2098 | /* | ||
| 2099 | * We don't want to do block allocation | ||
| 2100 | * So redirty the page and return | ||
| 2101 | * We may reach here when we do a journal commit | ||
| 2102 | * via journal_submit_inode_data_buffers. | ||
| 2103 | * If we don't have mapping block we just ignore | ||
| 2104 | * them. We can also reach here via shrink_page_list | ||
| 2105 | */ | ||
| 2106 | redirty_page_for_writepage(wbc, page); | ||
| 2107 | unlock_page(page); | ||
| 2108 | return 0; | ||
| 2109 | } | ||
| 2110 | } else { | ||
| 2111 | /* | ||
| 2112 | * The test for page_has_buffers() is subtle: | ||
| 2113 | * We know the page is dirty but it lost buffers. That means | ||
| 2114 | * that at some moment in time after write_begin()/write_end() | ||
| 2115 | * has been called all buffers have been clean and thus they | ||
| 2116 | * must have been written at least once. So they are all | ||
| 2117 | * mapped and we can happily proceed with mapping them | ||
| 2118 | * and writing the page. | ||
| 2119 | * | ||
| 2120 | * Try to initialize the buffer_heads and check whether | ||
| 2121 | * all are mapped and non delay. We don't want to | ||
| 2122 | * do block allocation here. | ||
| 2123 | */ | ||
| 2124 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | ||
| 2125 | ext4_normal_get_block_write); | ||
| 2126 | if (!ret) { | ||
| 2127 | page_bufs = page_buffers(page); | ||
| 2128 | /* check whether all are mapped and non delay */ | ||
| 2129 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | ||
| 2130 | ext4_bh_unmapped_or_delay)) { | ||
| 2131 | redirty_page_for_writepage(wbc, page); | ||
| 2132 | unlock_page(page); | ||
| 2133 | return 0; | ||
| 2134 | } | ||
| 2135 | } else { | ||
| 2136 | /* | ||
| 2137 | * We can't do block allocation here | ||
| 2138 | * so just redity the page and unlock | ||
| 2139 | * and return | ||
| 2140 | */ | ||
| 2141 | redirty_page_for_writepage(wbc, page); | ||
| 2142 | unlock_page(page); | ||
| 2143 | return 0; | ||
| 2144 | } | ||
| 2145 | } | ||
| 2146 | |||
| 2147 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | ||
| 2148 | ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); | ||
| 2149 | else | ||
| 2150 | ret = block_write_full_page(page, | ||
| 2151 | ext4_normal_get_block_write, | ||
| 2152 | wbc); | ||
| 2153 | |||
| 2154 | return ret; | ||
| 2155 | } | ||
| 2156 | |||
| 2157 | /* | ||
| 2158 | * For now just follow the DIO way to estimate the max credits | ||
| 2159 | * needed to write out EXT4_MAX_WRITEBACK_PAGES. | ||
| 2160 | * todo: need to calculate the max credits need for | ||
| 2161 | * extent based files, currently the DIO credits is based on | ||
| 2162 | * indirect-blocks mapping way. | ||
| 2163 | * | ||
| 2164 | * Probably should have a generic way to calculate credits | ||
| 2165 | * for DIO, writepages, and truncate | ||
| 2166 | */ | ||
| 2167 | #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS | ||
| 2168 | #define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS | ||
| 2169 | |||
| 2170 | static int ext4_da_writepages(struct address_space *mapping, | ||
| 2171 | struct writeback_control *wbc) | ||
| 2172 | { | ||
| 2173 | struct inode *inode = mapping->host; | ||
| 2174 | handle_t *handle = NULL; | ||
| 2175 | int needed_blocks; | ||
| 2176 | int ret = 0; | ||
| 2177 | long to_write; | ||
| 2178 | loff_t range_start = 0; | ||
| 2179 | |||
| 2180 | /* | ||
| 2181 | * No pages to write? This is mainly a kludge to avoid starting | ||
| 2182 | * a transaction for special inodes like journal inode on last iput() | ||
| 2183 | * because that could violate lock ordering on umount | ||
| 2184 | */ | ||
| 2185 | if (!mapping->nrpages) | ||
| 2186 | return 0; | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Estimate the worse case needed credits to write out | ||
| 2190 | * EXT4_MAX_BUF_BLOCKS pages | ||
| 2191 | */ | ||
| 2192 | needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; | ||
| 2193 | |||
| 2194 | to_write = wbc->nr_to_write; | ||
| 2195 | if (!wbc->range_cyclic) { | ||
| 2196 | /* | ||
| 2197 | * If range_cyclic is not set force range_cont | ||
| 2198 | * and save the old writeback_index | ||
| 2199 | */ | ||
| 2200 | wbc->range_cont = 1; | ||
| 2201 | range_start = wbc->range_start; | ||
| 2202 | } | ||
| 2203 | |||
| 2204 | while (!ret && to_write) { | ||
| 2205 | /* start a new transaction*/ | ||
| 2206 | handle = ext4_journal_start(inode, needed_blocks); | ||
| 2207 | if (IS_ERR(handle)) { | ||
| 2208 | ret = PTR_ERR(handle); | ||
| 2209 | goto out_writepages; | ||
| 2210 | } | ||
| 2211 | if (ext4_should_order_data(inode)) { | ||
| 2212 | /* | ||
| 2213 | * With ordered mode we need to add | ||
| 2214 | * the inode to the journal handle | ||
| 2215 | * when we do block allocation. | ||
| 2216 | */ | ||
| 2217 | ret = ext4_jbd2_file_inode(handle, inode); | ||
| 2218 | if (ret) { | ||
| 2219 | ext4_journal_stop(handle); | ||
| 2220 | goto out_writepages; | ||
| 2221 | } | ||
| 2222 | |||
| 2223 | } | ||
| 2224 | /* | ||
| 2225 | * set the max dirty pages could be write at a time | ||
| 2226 | * to fit into the reserved transaction credits | ||
| 2227 | */ | ||
| 2228 | if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) | ||
| 2229 | wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; | ||
| 2230 | |||
| 2231 | to_write -= wbc->nr_to_write; | ||
| 2232 | ret = mpage_da_writepages(mapping, wbc, | ||
| 2233 | ext4_da_get_block_write); | ||
| 2234 | ext4_journal_stop(handle); | ||
| 2235 | if (wbc->nr_to_write) { | ||
| 2236 | /* | ||
| 2237 | * There is no more writeout needed | ||
| 2238 | * or we requested for a noblocking writeout | ||
| 2239 | * and we found the device congested | ||
| 2240 | */ | ||
| 2241 | to_write += wbc->nr_to_write; | ||
| 2242 | break; | ||
| 2243 | } | ||
| 2244 | wbc->nr_to_write = to_write; | ||
| 2245 | } | ||
| 2246 | |||
| 2247 | out_writepages: | ||
| 2248 | wbc->nr_to_write = to_write; | ||
| 2249 | if (range_start) | ||
| 2250 | wbc->range_start = range_start; | ||
| 2251 | return ret; | ||
| 2252 | } | ||
| 2253 | |||
| 2254 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | ||
| 2255 | loff_t pos, unsigned len, unsigned flags, | ||
| 2256 | struct page **pagep, void **fsdata) | ||
| 2257 | { | ||
| 2258 | int ret, retries = 0; | ||
| 2259 | struct page *page; | ||
| 2260 | pgoff_t index; | ||
| 2261 | unsigned from, to; | ||
| 2262 | struct inode *inode = mapping->host; | ||
| 2263 | handle_t *handle; | ||
| 2264 | |||
| 2265 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 2266 | from = pos & (PAGE_CACHE_SIZE - 1); | ||
| 2267 | to = from + len; | ||
| 2268 | |||
| 2269 | retry: | ||
| 2270 | /* | ||
| 2271 | * With delayed allocation, we don't log the i_disksize update | ||
| 2272 | * if there is delayed block allocation. But we still need | ||
| 2273 | * to journalling the i_disksize update if writes to the end | ||
| 2274 | * of file which has an already mapped buffer. | ||
| 2275 | */ | ||
| 2276 | handle = ext4_journal_start(inode, 1); | ||
| 2277 | if (IS_ERR(handle)) { | ||
| 2278 | ret = PTR_ERR(handle); | ||
| 2279 | goto out; | ||
| 2280 | } | ||
| 2281 | |||
| 2282 | page = __grab_cache_page(mapping, index); | ||
| 2283 | if (!page) | ||
| 2284 | return -ENOMEM; | ||
| 2285 | *pagep = page; | ||
| 2286 | |||
| 2287 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | ||
| 2288 | ext4_da_get_block_prep); | ||
| 2289 | if (ret < 0) { | ||
| 2290 | unlock_page(page); | ||
| 2291 | ext4_journal_stop(handle); | ||
| 2292 | page_cache_release(page); | ||
| 2293 | } | ||
| 2294 | |||
| 2295 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
| 2296 | goto retry; | ||
| 2297 | out: | ||
| 2298 | return ret; | ||
| 2299 | } | ||
| 2300 | |||
| 2301 | /* | ||
| 2302 | * Check if we should update i_disksize | ||
| 2303 | * when write to the end of file but not require block allocation | ||
| 2304 | */ | ||
| 2305 | static int ext4_da_should_update_i_disksize(struct page *page, | ||
| 2306 | unsigned long offset) | ||
| 2307 | { | ||
| 2308 | struct buffer_head *bh; | ||
| 2309 | struct inode *inode = page->mapping->host; | ||
| 2310 | unsigned int idx; | ||
| 2311 | int i; | ||
| 2312 | |||
| 2313 | bh = page_buffers(page); | ||
| 2314 | idx = offset >> inode->i_blkbits; | ||
| 2315 | |||
| 2316 | for (i=0; i < idx; i++) | ||
| 2317 | bh = bh->b_this_page; | ||
| 2318 | |||
| 2319 | if (!buffer_mapped(bh) || (buffer_delay(bh))) | ||
| 2320 | return 0; | ||
| 2321 | return 1; | ||
| 2322 | } | ||
| 2323 | |||
| 2324 | static int ext4_da_write_end(struct file *file, | ||
| 2325 | struct address_space *mapping, | ||
| 2326 | loff_t pos, unsigned len, unsigned copied, | ||
| 2327 | struct page *page, void *fsdata) | ||
| 2328 | { | ||
| 2329 | struct inode *inode = mapping->host; | ||
| 2330 | int ret = 0, ret2; | ||
| 2331 | handle_t *handle = ext4_journal_current_handle(); | ||
| 2332 | loff_t new_i_size; | ||
| 2333 | unsigned long start, end; | ||
| 2334 | |||
| 2335 | start = pos & (PAGE_CACHE_SIZE - 1); | ||
| 2336 | end = start + copied -1; | ||
| 2337 | |||
| 2338 | /* | ||
| 2339 | * generic_write_end() will run mark_inode_dirty() if i_size | ||
| 2340 | * changes. So let's piggyback the i_disksize mark_inode_dirty | ||
| 2341 | * into that. | ||
| 2342 | */ | ||
| 2343 | |||
| 2344 | new_i_size = pos + copied; | ||
| 2345 | if (new_i_size > EXT4_I(inode)->i_disksize) { | ||
| 2346 | if (ext4_da_should_update_i_disksize(page, end)) { | ||
| 2347 | down_write(&EXT4_I(inode)->i_data_sem); | ||
| 2348 | if (new_i_size > EXT4_I(inode)->i_disksize) { | ||
| 2349 | /* | ||
| 2350 | * Updating i_disksize when extending file | ||
| 2351 | * without needing block allocation | ||
| 2352 | */ | ||
| 2353 | if (ext4_should_order_data(inode)) | ||
| 2354 | ret = ext4_jbd2_file_inode(handle, | ||
| 2355 | inode); | ||
| 2356 | |||
| 2357 | EXT4_I(inode)->i_disksize = new_i_size; | ||
| 2358 | } | ||
| 2359 | up_write(&EXT4_I(inode)->i_data_sem); | ||
| 2360 | } | ||
| 2361 | } | ||
| 2362 | ret2 = generic_write_end(file, mapping, pos, len, copied, | ||
| 2363 | page, fsdata); | ||
| 2364 | copied = ret2; | ||
| 2365 | if (ret2 < 0) | ||
| 2366 | ret = ret2; | ||
| 2367 | ret2 = ext4_journal_stop(handle); | ||
| 2368 | if (!ret) | ||
| 2369 | ret = ret2; | ||
| 2370 | |||
| 2371 | return ret ? ret : copied; | ||
| 2372 | } | ||
| 2373 | |||
| 2374 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | ||
| 2375 | { | ||
| 2376 | /* | ||
| 2377 | * Drop reserved blocks | ||
| 2378 | */ | ||
| 2379 | BUG_ON(!PageLocked(page)); | ||
| 2380 | if (!page_has_buffers(page)) | ||
| 2381 | goto out; | ||
| 2382 | |||
| 2383 | ext4_da_page_release_reservation(page, offset); | ||
| 2384 | |||
| 2385 | out: | ||
| 2386 | ext4_invalidatepage(page, offset); | ||
| 2387 | |||
| 2388 | return; | ||
| 2389 | } | ||
| 2390 | |||
| 1400 | 2391 | ||
| 1401 | /* | 2392 | /* |
| 1402 | * bmap() is special. It gets used by applications such as lilo and by | 2393 | * bmap() is special. It gets used by applications such as lilo and by |
| @@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
| 1418 | journal_t *journal; | 2409 | journal_t *journal; |
| 1419 | int err; | 2410 | int err; |
| 1420 | 2411 | ||
| 2412 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && | ||
| 2413 | test_opt(inode->i_sb, DELALLOC)) { | ||
| 2414 | /* | ||
| 2415 | * With delalloc we want to sync the file | ||
| 2416 | * so that we can make sure we allocate | ||
| 2417 | * blocks for file | ||
| 2418 | */ | ||
| 2419 | filemap_write_and_wait(mapping); | ||
| 2420 | } | ||
| 2421 | |||
| 1421 | if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { | 2422 | if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { |
| 1422 | /* | 2423 | /* |
| 1423 | * This is a REALLY heavyweight approach, but the use of | 2424 | * This is a REALLY heavyweight approach, but the use of |
| @@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) | |||
| 1462 | return 0; | 2463 | return 0; |
| 1463 | } | 2464 | } |
| 1464 | 2465 | ||
| 1465 | static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | ||
| 1466 | { | ||
| 1467 | if (buffer_mapped(bh)) | ||
| 1468 | return ext4_journal_dirty_data(handle, bh); | ||
| 1469 | return 0; | ||
| 1470 | } | ||
| 1471 | |||
| 1472 | /* | 2466 | /* |
| 1473 | * Note that we always start a transaction even if we're not journalling | 2467 | * Note that we don't need to start a transaction unless we're journaling data |
| 1474 | * data. This is to preserve ordering: any hole instantiation within | 2468 | * because we should have holes filled from ext4_page_mkwrite(). We even don't |
| 1475 | * __block_write_full_page -> ext4_get_block() should be journalled | 2469 | * need to file the inode to the transaction's list in ordered mode because if |
| 1476 | * along with the data so we don't crash and then get metadata which | 2470 | * we are writing back data added by write(), the inode is already there and if |
| 1477 | * refers to old data. | 2471 | * we are writing back data modified via mmap(), noone guarantees in which |
| 2472 | * transaction the data will hit the disk. In case we are journaling data, we | ||
| 2473 | * cannot start transaction directly because transaction start ranks above page | ||
| 2474 | * lock so we have to do some magic. | ||
| 1478 | * | 2475 | * |
| 1479 | * In all journalling modes block_write_full_page() will start the I/O. | 2476 | * In all journaling modes block_write_full_page() will start the I/O. |
| 1480 | * | 2477 | * |
| 1481 | * Problem: | 2478 | * Problem: |
| 1482 | * | 2479 | * |
| @@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |||
| 1518 | * disastrous. Any write() or metadata operation will sync the fs for | 2515 | * disastrous. Any write() or metadata operation will sync the fs for |
| 1519 | * us. | 2516 | * us. |
| 1520 | * | 2517 | * |
| 1521 | * AKPM2: if all the page's buffers are mapped to disk and !data=journal, | ||
| 1522 | * we don't need to open a transaction here. | ||
| 1523 | */ | 2518 | */ |
| 1524 | static int ext4_ordered_writepage(struct page *page, | 2519 | static int __ext4_normal_writepage(struct page *page, |
| 1525 | struct writeback_control *wbc) | 2520 | struct writeback_control *wbc) |
| 1526 | { | 2521 | { |
| 1527 | struct inode *inode = page->mapping->host; | 2522 | struct inode *inode = page->mapping->host; |
| 1528 | struct buffer_head *page_bufs; | ||
| 1529 | handle_t *handle = NULL; | ||
| 1530 | int ret = 0; | ||
| 1531 | int err; | ||
| 1532 | |||
| 1533 | J_ASSERT(PageLocked(page)); | ||
| 1534 | |||
| 1535 | /* | ||
| 1536 | * We give up here if we're reentered, because it might be for a | ||
| 1537 | * different filesystem. | ||
| 1538 | */ | ||
| 1539 | if (ext4_journal_current_handle()) | ||
| 1540 | goto out_fail; | ||
| 1541 | 2523 | ||
| 1542 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2524 | if (test_opt(inode->i_sb, NOBH)) |
| 2525 | return nobh_writepage(page, | ||
| 2526 | ext4_normal_get_block_write, wbc); | ||
| 2527 | else | ||
| 2528 | return block_write_full_page(page, | ||
| 2529 | ext4_normal_get_block_write, | ||
| 2530 | wbc); | ||
| 2531 | } | ||
| 1543 | 2532 | ||
| 1544 | if (IS_ERR(handle)) { | 2533 | static int ext4_normal_writepage(struct page *page, |
| 1545 | ret = PTR_ERR(handle); | 2534 | struct writeback_control *wbc) |
| 1546 | goto out_fail; | 2535 | { |
| 1547 | } | 2536 | struct inode *inode = page->mapping->host; |
| 2537 | loff_t size = i_size_read(inode); | ||
| 2538 | loff_t len; | ||
| 1548 | 2539 | ||
| 1549 | if (!page_has_buffers(page)) { | 2540 | J_ASSERT(PageLocked(page)); |
| 1550 | create_empty_buffers(page, inode->i_sb->s_blocksize, | 2541 | if (page->index == size >> PAGE_CACHE_SHIFT) |
| 1551 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | 2542 | len = size & ~PAGE_CACHE_MASK; |
| 2543 | else | ||
| 2544 | len = PAGE_CACHE_SIZE; | ||
| 2545 | |||
| 2546 | if (page_has_buffers(page)) { | ||
| 2547 | /* if page has buffers it should all be mapped | ||
| 2548 | * and allocated. If there are not buffers attached | ||
| 2549 | * to the page we know the page is dirty but it lost | ||
| 2550 | * buffers. That means that at some moment in time | ||
| 2551 | * after write_begin() / write_end() has been called | ||
| 2552 | * all buffers have been clean and thus they must have been | ||
| 2553 | * written at least once. So they are all mapped and we can | ||
| 2554 | * happily proceed with mapping them and writing the page. | ||
| 2555 | */ | ||
| 2556 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
| 2557 | ext4_bh_unmapped_or_delay)); | ||
| 1552 | } | 2558 | } |
| 1553 | page_bufs = page_buffers(page); | ||
| 1554 | walk_page_buffers(handle, page_bufs, 0, | ||
| 1555 | PAGE_CACHE_SIZE, NULL, bget_one); | ||
| 1556 | |||
| 1557 | ret = block_write_full_page(page, ext4_get_block, wbc); | ||
| 1558 | 2559 | ||
| 1559 | /* | 2560 | if (!ext4_journal_current_handle()) |
| 1560 | * The page can become unlocked at any point now, and | 2561 | return __ext4_normal_writepage(page, wbc); |
| 1561 | * truncate can then come in and change things. So we | ||
| 1562 | * can't touch *page from now on. But *page_bufs is | ||
| 1563 | * safe due to elevated refcount. | ||
| 1564 | */ | ||
| 1565 | 2562 | ||
| 1566 | /* | ||
| 1567 | * And attach them to the current transaction. But only if | ||
| 1568 | * block_write_full_page() succeeded. Otherwise they are unmapped, | ||
| 1569 | * and generally junk. | ||
| 1570 | */ | ||
| 1571 | if (ret == 0) { | ||
| 1572 | err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, | ||
| 1573 | NULL, jbd2_journal_dirty_data_fn); | ||
| 1574 | if (!ret) | ||
| 1575 | ret = err; | ||
| 1576 | } | ||
| 1577 | walk_page_buffers(handle, page_bufs, 0, | ||
| 1578 | PAGE_CACHE_SIZE, NULL, bput_one); | ||
| 1579 | err = ext4_journal_stop(handle); | ||
| 1580 | if (!ret) | ||
| 1581 | ret = err; | ||
| 1582 | return ret; | ||
| 1583 | |||
| 1584 | out_fail: | ||
| 1585 | redirty_page_for_writepage(wbc, page); | 2563 | redirty_page_for_writepage(wbc, page); |
| 1586 | unlock_page(page); | 2564 | unlock_page(page); |
| 1587 | return ret; | 2565 | return 0; |
| 1588 | } | 2566 | } |
| 1589 | 2567 | ||
| 1590 | static int ext4_writeback_writepage(struct page *page, | 2568 | static int __ext4_journalled_writepage(struct page *page, |
| 1591 | struct writeback_control *wbc) | 2569 | struct writeback_control *wbc) |
| 1592 | { | 2570 | { |
| 1593 | struct inode *inode = page->mapping->host; | 2571 | struct address_space *mapping = page->mapping; |
| 2572 | struct inode *inode = mapping->host; | ||
| 2573 | struct buffer_head *page_bufs; | ||
| 1594 | handle_t *handle = NULL; | 2574 | handle_t *handle = NULL; |
| 1595 | int ret = 0; | 2575 | int ret = 0; |
| 1596 | int err; | 2576 | int err; |
| 1597 | 2577 | ||
| 1598 | if (ext4_journal_current_handle()) | 2578 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, |
| 1599 | goto out_fail; | 2579 | ext4_normal_get_block_write); |
| 2580 | if (ret != 0) | ||
| 2581 | goto out_unlock; | ||
| 2582 | |||
| 2583 | page_bufs = page_buffers(page); | ||
| 2584 | walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, | ||
| 2585 | bget_one); | ||
| 2586 | /* As soon as we unlock the page, it can go away, but we have | ||
| 2587 | * references to buffers so we are safe */ | ||
| 2588 | unlock_page(page); | ||
| 1600 | 2589 | ||
| 1601 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2590 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); |
| 1602 | if (IS_ERR(handle)) { | 2591 | if (IS_ERR(handle)) { |
| 1603 | ret = PTR_ERR(handle); | 2592 | ret = PTR_ERR(handle); |
| 1604 | goto out_fail; | 2593 | goto out; |
| 1605 | } | 2594 | } |
| 1606 | 2595 | ||
| 1607 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | 2596 | ret = walk_page_buffers(handle, page_bufs, 0, |
| 1608 | ret = nobh_writepage(page, ext4_get_block, wbc); | 2597 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); |
| 1609 | else | ||
| 1610 | ret = block_write_full_page(page, ext4_get_block, wbc); | ||
| 1611 | 2598 | ||
| 2599 | err = walk_page_buffers(handle, page_bufs, 0, | ||
| 2600 | PAGE_CACHE_SIZE, NULL, write_end_fn); | ||
| 2601 | if (ret == 0) | ||
| 2602 | ret = err; | ||
| 1612 | err = ext4_journal_stop(handle); | 2603 | err = ext4_journal_stop(handle); |
| 1613 | if (!ret) | 2604 | if (!ret) |
| 1614 | ret = err; | 2605 | ret = err; |
| 1615 | return ret; | ||
| 1616 | 2606 | ||
| 1617 | out_fail: | 2607 | walk_page_buffers(handle, page_bufs, 0, |
| 1618 | redirty_page_for_writepage(wbc, page); | 2608 | PAGE_CACHE_SIZE, NULL, bput_one); |
| 2609 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | ||
| 2610 | goto out; | ||
| 2611 | |||
| 2612 | out_unlock: | ||
| 1619 | unlock_page(page); | 2613 | unlock_page(page); |
| 2614 | out: | ||
| 1620 | return ret; | 2615 | return ret; |
| 1621 | } | 2616 | } |
| 1622 | 2617 | ||
| @@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page, | |||
| 1624 | struct writeback_control *wbc) | 2619 | struct writeback_control *wbc) |
| 1625 | { | 2620 | { |
| 1626 | struct inode *inode = page->mapping->host; | 2621 | struct inode *inode = page->mapping->host; |
| 1627 | handle_t *handle = NULL; | 2622 | loff_t size = i_size_read(inode); |
| 1628 | int ret = 0; | 2623 | loff_t len; |
| 1629 | int err; | ||
| 1630 | 2624 | ||
| 1631 | if (ext4_journal_current_handle()) | 2625 | J_ASSERT(PageLocked(page)); |
| 1632 | goto no_write; | 2626 | if (page->index == size >> PAGE_CACHE_SHIFT) |
| 2627 | len = size & ~PAGE_CACHE_MASK; | ||
| 2628 | else | ||
| 2629 | len = PAGE_CACHE_SIZE; | ||
| 2630 | |||
| 2631 | if (page_has_buffers(page)) { | ||
| 2632 | /* if page has buffers it should all be mapped | ||
| 2633 | * and allocated. If there are not buffers attached | ||
| 2634 | * to the page we know the page is dirty but it lost | ||
| 2635 | * buffers. That means that at some moment in time | ||
| 2636 | * after write_begin() / write_end() has been called | ||
| 2637 | * all buffers have been clean and thus they must have been | ||
| 2638 | * written at least once. So they are all mapped and we can | ||
| 2639 | * happily proceed with mapping them and writing the page. | ||
| 2640 | */ | ||
| 2641 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
| 2642 | ext4_bh_unmapped_or_delay)); | ||
| 2643 | } | ||
| 1633 | 2644 | ||
| 1634 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2645 | if (ext4_journal_current_handle()) |
| 1635 | if (IS_ERR(handle)) { | ||
| 1636 | ret = PTR_ERR(handle); | ||
| 1637 | goto no_write; | 2646 | goto no_write; |
| 1638 | } | ||
| 1639 | 2647 | ||
| 1640 | if (!page_has_buffers(page) || PageChecked(page)) { | 2648 | if (PageChecked(page)) { |
| 1641 | /* | 2649 | /* |
| 1642 | * It's mmapped pagecache. Add buffers and journal it. There | 2650 | * It's mmapped pagecache. Add buffers and journal it. There |
| 1643 | * doesn't seem much point in redirtying the page here. | 2651 | * doesn't seem much point in redirtying the page here. |
| 1644 | */ | 2652 | */ |
| 1645 | ClearPageChecked(page); | 2653 | ClearPageChecked(page); |
| 1646 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | 2654 | return __ext4_journalled_writepage(page, wbc); |
| 1647 | ext4_get_block); | ||
| 1648 | if (ret != 0) { | ||
| 1649 | ext4_journal_stop(handle); | ||
| 1650 | goto out_unlock; | ||
| 1651 | } | ||
| 1652 | ret = walk_page_buffers(handle, page_buffers(page), 0, | ||
| 1653 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | ||
| 1654 | |||
| 1655 | err = walk_page_buffers(handle, page_buffers(page), 0, | ||
| 1656 | PAGE_CACHE_SIZE, NULL, write_end_fn); | ||
| 1657 | if (ret == 0) | ||
| 1658 | ret = err; | ||
| 1659 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | ||
| 1660 | unlock_page(page); | ||
| 1661 | } else { | 2655 | } else { |
| 1662 | /* | 2656 | /* |
| 1663 | * It may be a page full of checkpoint-mode buffers. We don't | 2657 | * It may be a page full of checkpoint-mode buffers. We don't |
| 1664 | * really know unless we go poke around in the buffer_heads. | 2658 | * really know unless we go poke around in the buffer_heads. |
| 1665 | * But block_write_full_page will do the right thing. | 2659 | * But block_write_full_page will do the right thing. |
| 1666 | */ | 2660 | */ |
| 1667 | ret = block_write_full_page(page, ext4_get_block, wbc); | 2661 | return block_write_full_page(page, |
| 2662 | ext4_normal_get_block_write, | ||
| 2663 | wbc); | ||
| 1668 | } | 2664 | } |
| 1669 | err = ext4_journal_stop(handle); | ||
| 1670 | if (!ret) | ||
| 1671 | ret = err; | ||
| 1672 | out: | ||
| 1673 | return ret; | ||
| 1674 | |||
| 1675 | no_write: | 2665 | no_write: |
| 1676 | redirty_page_for_writepage(wbc, page); | 2666 | redirty_page_for_writepage(wbc, page); |
| 1677 | out_unlock: | ||
| 1678 | unlock_page(page); | 2667 | unlock_page(page); |
| 1679 | goto out; | 2668 | return 0; |
| 1680 | } | 2669 | } |
| 1681 | 2670 | ||
| 1682 | static int ext4_readpage(struct file *file, struct page *page) | 2671 | static int ext4_readpage(struct file *file, struct page *page) |
| @@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
| 1819 | static const struct address_space_operations ext4_ordered_aops = { | 2808 | static const struct address_space_operations ext4_ordered_aops = { |
| 1820 | .readpage = ext4_readpage, | 2809 | .readpage = ext4_readpage, |
| 1821 | .readpages = ext4_readpages, | 2810 | .readpages = ext4_readpages, |
| 1822 | .writepage = ext4_ordered_writepage, | 2811 | .writepage = ext4_normal_writepage, |
| 1823 | .sync_page = block_sync_page, | 2812 | .sync_page = block_sync_page, |
| 1824 | .write_begin = ext4_write_begin, | 2813 | .write_begin = ext4_write_begin, |
| 1825 | .write_end = ext4_ordered_write_end, | 2814 | .write_end = ext4_ordered_write_end, |
| @@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = { | |||
| 1833 | static const struct address_space_operations ext4_writeback_aops = { | 2822 | static const struct address_space_operations ext4_writeback_aops = { |
| 1834 | .readpage = ext4_readpage, | 2823 | .readpage = ext4_readpage, |
| 1835 | .readpages = ext4_readpages, | 2824 | .readpages = ext4_readpages, |
| 1836 | .writepage = ext4_writeback_writepage, | 2825 | .writepage = ext4_normal_writepage, |
| 1837 | .sync_page = block_sync_page, | 2826 | .sync_page = block_sync_page, |
| 1838 | .write_begin = ext4_write_begin, | 2827 | .write_begin = ext4_write_begin, |
| 1839 | .write_end = ext4_writeback_write_end, | 2828 | .write_end = ext4_writeback_write_end, |
| @@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
| 1857 | .releasepage = ext4_releasepage, | 2846 | .releasepage = ext4_releasepage, |
| 1858 | }; | 2847 | }; |
| 1859 | 2848 | ||
| 2849 | static const struct address_space_operations ext4_da_aops = { | ||
| 2850 | .readpage = ext4_readpage, | ||
| 2851 | .readpages = ext4_readpages, | ||
| 2852 | .writepage = ext4_da_writepage, | ||
| 2853 | .writepages = ext4_da_writepages, | ||
| 2854 | .sync_page = block_sync_page, | ||
| 2855 | .write_begin = ext4_da_write_begin, | ||
| 2856 | .write_end = ext4_da_write_end, | ||
| 2857 | .bmap = ext4_bmap, | ||
| 2858 | .invalidatepage = ext4_da_invalidatepage, | ||
| 2859 | .releasepage = ext4_releasepage, | ||
| 2860 | .direct_IO = ext4_direct_IO, | ||
| 2861 | .migratepage = buffer_migrate_page, | ||
| 2862 | }; | ||
| 2863 | |||
| 1860 | void ext4_set_aops(struct inode *inode) | 2864 | void ext4_set_aops(struct inode *inode) |
| 1861 | { | 2865 | { |
| 1862 | if (ext4_should_order_data(inode)) | 2866 | if (ext4_should_order_data(inode) && |
| 2867 | test_opt(inode->i_sb, DELALLOC)) | ||
| 2868 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
| 2869 | else if (ext4_should_order_data(inode)) | ||
| 1863 | inode->i_mapping->a_ops = &ext4_ordered_aops; | 2870 | inode->i_mapping->a_ops = &ext4_ordered_aops; |
| 2871 | else if (ext4_should_writeback_data(inode) && | ||
| 2872 | test_opt(inode->i_sb, DELALLOC)) | ||
| 2873 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
| 1864 | else if (ext4_should_writeback_data(inode)) | 2874 | else if (ext4_should_writeback_data(inode)) |
| 1865 | inode->i_mapping->a_ops = &ext4_writeback_aops; | 2875 | inode->i_mapping->a_ops = &ext4_writeback_aops; |
| 1866 | else | 2876 | else |
| @@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode) | |||
| 1873 | * This required during truncate. We need to physically zero the tail end | 2883 | * This required during truncate. We need to physically zero the tail end |
| 1874 | * of that block so it doesn't yield old data if the file is later grown. | 2884 | * of that block so it doesn't yield old data if the file is later grown. |
| 1875 | */ | 2885 | */ |
| 1876 | int ext4_block_truncate_page(handle_t *handle, struct page *page, | 2886 | int ext4_block_truncate_page(handle_t *handle, |
| 1877 | struct address_space *mapping, loff_t from) | 2887 | struct address_space *mapping, loff_t from) |
| 1878 | { | 2888 | { |
| 1879 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 2889 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
| @@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |||
| 1882 | ext4_lblk_t iblock; | 2892 | ext4_lblk_t iblock; |
| 1883 | struct inode *inode = mapping->host; | 2893 | struct inode *inode = mapping->host; |
| 1884 | struct buffer_head *bh; | 2894 | struct buffer_head *bh; |
| 2895 | struct page *page; | ||
| 1885 | int err = 0; | 2896 | int err = 0; |
| 1886 | 2897 | ||
| 2898 | page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); | ||
| 2899 | if (!page) | ||
| 2900 | return -EINVAL; | ||
| 2901 | |||
| 1887 | blocksize = inode->i_sb->s_blocksize; | 2902 | blocksize = inode->i_sb->s_blocksize; |
| 1888 | length = blocksize - (offset & (blocksize - 1)); | 2903 | length = blocksize - (offset & (blocksize - 1)); |
| 1889 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 2904 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); |
| @@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |||
| 1956 | err = ext4_journal_dirty_metadata(handle, bh); | 2971 | err = ext4_journal_dirty_metadata(handle, bh); |
| 1957 | } else { | 2972 | } else { |
| 1958 | if (ext4_should_order_data(inode)) | 2973 | if (ext4_should_order_data(inode)) |
| 1959 | err = ext4_journal_dirty_data(handle, bh); | 2974 | err = ext4_jbd2_file_inode(handle, inode); |
| 1960 | mark_buffer_dirty(bh); | 2975 | mark_buffer_dirty(bh); |
| 1961 | } | 2976 | } |
| 1962 | 2977 | ||
| @@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
| 2179 | 3194 | ||
| 2180 | if (this_bh) { | 3195 | if (this_bh) { |
| 2181 | BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); | 3196 | BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); |
| 2182 | ext4_journal_dirty_metadata(handle, this_bh); | 3197 | |
| 3198 | /* | ||
| 3199 | * The buffer head should have an attached journal head at this | ||
| 3200 | * point. However, if the data is corrupted and an indirect | ||
| 3201 | * block pointed to itself, it would have been detached when | ||
| 3202 | * the block was cleared. Check for this instead of OOPSing. | ||
| 3203 | */ | ||
| 3204 | if (bh2jh(this_bh)) | ||
| 3205 | ext4_journal_dirty_metadata(handle, this_bh); | ||
| 3206 | else | ||
| 3207 | ext4_error(inode->i_sb, __func__, | ||
| 3208 | "circular indirect block detected, " | ||
| 3209 | "inode=%lu, block=%llu", | ||
| 3210 | inode->i_ino, | ||
| 3211 | (unsigned long long) this_bh->b_blocknr); | ||
| 2183 | } | 3212 | } |
| 2184 | } | 3213 | } |
| 2185 | 3214 | ||
| @@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
| 2305 | } | 3334 | } |
| 2306 | } | 3335 | } |
| 2307 | 3336 | ||
| 3337 | int ext4_can_truncate(struct inode *inode) | ||
| 3338 | { | ||
| 3339 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
| 3340 | return 0; | ||
| 3341 | if (S_ISREG(inode->i_mode)) | ||
| 3342 | return 1; | ||
| 3343 | if (S_ISDIR(inode->i_mode)) | ||
| 3344 | return 1; | ||
| 3345 | if (S_ISLNK(inode->i_mode)) | ||
| 3346 | return !ext4_inode_is_fast_symlink(inode); | ||
| 3347 | return 0; | ||
| 3348 | } | ||
| 3349 | |||
| 2308 | /* | 3350 | /* |
| 2309 | * ext4_truncate() | 3351 | * ext4_truncate() |
| 2310 | * | 3352 | * |
| @@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode) | |||
| 2347 | int n; | 3389 | int n; |
| 2348 | ext4_lblk_t last_block; | 3390 | ext4_lblk_t last_block; |
| 2349 | unsigned blocksize = inode->i_sb->s_blocksize; | 3391 | unsigned blocksize = inode->i_sb->s_blocksize; |
| 2350 | struct page *page; | ||
| 2351 | 3392 | ||
| 2352 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 3393 | if (!ext4_can_truncate(inode)) |
| 2353 | S_ISLNK(inode->i_mode))) | ||
| 2354 | return; | ||
| 2355 | if (ext4_inode_is_fast_symlink(inode)) | ||
| 2356 | return; | ||
| 2357 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
| 2358 | return; | 3394 | return; |
| 2359 | 3395 | ||
| 2360 | /* | ||
| 2361 | * We have to lock the EOF page here, because lock_page() nests | ||
| 2362 | * outside jbd2_journal_start(). | ||
| 2363 | */ | ||
| 2364 | if ((inode->i_size & (blocksize - 1)) == 0) { | ||
| 2365 | /* Block boundary? Nothing to do */ | ||
| 2366 | page = NULL; | ||
| 2367 | } else { | ||
| 2368 | page = grab_cache_page(mapping, | ||
| 2369 | inode->i_size >> PAGE_CACHE_SHIFT); | ||
| 2370 | if (!page) | ||
| 2371 | return; | ||
| 2372 | } | ||
| 2373 | |||
| 2374 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 3396 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
| 2375 | ext4_ext_truncate(inode, page); | 3397 | ext4_ext_truncate(inode); |
| 2376 | return; | 3398 | return; |
| 2377 | } | 3399 | } |
| 2378 | 3400 | ||
| 2379 | handle = start_transaction(inode); | 3401 | handle = start_transaction(inode); |
| 2380 | if (IS_ERR(handle)) { | 3402 | if (IS_ERR(handle)) |
| 2381 | if (page) { | ||
| 2382 | clear_highpage(page); | ||
| 2383 | flush_dcache_page(page); | ||
| 2384 | unlock_page(page); | ||
| 2385 | page_cache_release(page); | ||
| 2386 | } | ||
| 2387 | return; /* AKPM: return what? */ | 3403 | return; /* AKPM: return what? */ |
| 2388 | } | ||
| 2389 | 3404 | ||
| 2390 | last_block = (inode->i_size + blocksize-1) | 3405 | last_block = (inode->i_size + blocksize-1) |
| 2391 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 3406 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
| 2392 | 3407 | ||
| 2393 | if (page) | 3408 | if (inode->i_size & (blocksize - 1)) |
| 2394 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); | 3409 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) |
| 3410 | goto out_stop; | ||
| 2395 | 3411 | ||
| 2396 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | 3412 | n = ext4_block_to_path(inode, last_block, offsets, NULL); |
| 2397 | if (n == 0) | 3413 | if (n == 0) |
| @@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode) | |||
| 2410 | goto out_stop; | 3426 | goto out_stop; |
| 2411 | 3427 | ||
| 2412 | /* | 3428 | /* |
| 3429 | * From here we block out all ext4_get_block() callers who want to | ||
| 3430 | * modify the block allocation tree. | ||
| 3431 | */ | ||
| 3432 | down_write(&ei->i_data_sem); | ||
| 3433 | /* | ||
| 2413 | * The orphan list entry will now protect us from any crash which | 3434 | * The orphan list entry will now protect us from any crash which |
| 2414 | * occurs before the truncate completes, so it is now safe to propagate | 3435 | * occurs before the truncate completes, so it is now safe to propagate |
| 2415 | * the new, shorter inode size (held for now in i_size) into the | 3436 | * the new, shorter inode size (held for now in i_size) into the |
| @@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode) | |||
| 2418 | */ | 3439 | */ |
| 2419 | ei->i_disksize = inode->i_size; | 3440 | ei->i_disksize = inode->i_size; |
| 2420 | 3441 | ||
| 2421 | /* | ||
| 2422 | * From here we block out all ext4_get_block() callers who want to | ||
| 2423 | * modify the block allocation tree. | ||
| 2424 | */ | ||
| 2425 | down_write(&ei->i_data_sem); | ||
| 2426 | |||
| 2427 | if (n == 1) { /* direct blocks */ | 3442 | if (n == 1) { /* direct blocks */ |
| 2428 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 3443 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
| 2429 | i_data + EXT4_NDIR_BLOCKS); | 3444 | i_data + EXT4_NDIR_BLOCKS); |
| @@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait) | |||
| 3107 | * be freed, so we have a strong guarantee that no future commit will | 4122 | * be freed, so we have a strong guarantee that no future commit will |
| 3108 | * leave these blocks visible to the user.) | 4123 | * leave these blocks visible to the user.) |
| 3109 | * | 4124 | * |
| 3110 | * Called with inode->sem down. | 4125 | * Another thing we have to assure is that if we are in ordered mode |
| 4126 | * and inode is still attached to the committing transaction, we must | ||
| 4127 | * we start writeout of all the dirty pages which are being truncated. | ||
| 4128 | * This way we are sure that all the data written in the previous | ||
| 4129 | * transaction are already on disk (truncate waits for pages under | ||
| 4130 | * writeback). | ||
| 4131 | * | ||
| 4132 | * Called with inode->i_mutex down. | ||
| 3111 | */ | 4133 | */ |
| 3112 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) | 4134 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) |
| 3113 | { | 4135 | { |
| @@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 3173 | if (!error) | 4195 | if (!error) |
| 3174 | error = rc; | 4196 | error = rc; |
| 3175 | ext4_journal_stop(handle); | 4197 | ext4_journal_stop(handle); |
| 4198 | |||
| 4199 | if (ext4_should_order_data(inode)) { | ||
| 4200 | error = ext4_begin_ordered_truncate(inode, | ||
| 4201 | attr->ia_size); | ||
| 4202 | if (error) { | ||
| 4203 | /* Do as much error cleanup as possible */ | ||
| 4204 | handle = ext4_journal_start(inode, 3); | ||
| 4205 | if (IS_ERR(handle)) { | ||
| 4206 | ext4_orphan_del(NULL, inode); | ||
| 4207 | goto err_out; | ||
| 4208 | } | ||
| 4209 | ext4_orphan_del(handle, inode); | ||
| 4210 | ext4_journal_stop(handle); | ||
| 4211 | goto err_out; | ||
| 4212 | } | ||
| 4213 | } | ||
| 3176 | } | 4214 | } |
| 3177 | 4215 | ||
| 3178 | rc = inode_setattr(inode, attr); | 4216 | rc = inode_setattr(inode, attr); |
| @@ -3193,6 +4231,32 @@ err_out: | |||
| 3193 | return error; | 4231 | return error; |
| 3194 | } | 4232 | } |
| 3195 | 4233 | ||
| 4234 | int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 4235 | struct kstat *stat) | ||
| 4236 | { | ||
| 4237 | struct inode *inode; | ||
| 4238 | unsigned long delalloc_blocks; | ||
| 4239 | |||
| 4240 | inode = dentry->d_inode; | ||
| 4241 | generic_fillattr(inode, stat); | ||
| 4242 | |||
| 4243 | /* | ||
| 4244 | * We can't update i_blocks if the block allocation is delayed | ||
| 4245 | * otherwise in the case of system crash before the real block | ||
| 4246 | * allocation is done, we will have i_blocks inconsistent with | ||
| 4247 | * on-disk file blocks. | ||
| 4248 | * We always keep i_blocks updated together with real | ||
| 4249 | * allocation. But to not confuse with user, stat | ||
| 4250 | * will return the blocks that include the delayed allocation | ||
| 4251 | * blocks for this file. | ||
| 4252 | */ | ||
| 4253 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 4254 | delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; | ||
| 4255 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 4256 | |||
| 4257 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | ||
| 4258 | return 0; | ||
| 4259 | } | ||
| 3196 | 4260 | ||
| 3197 | /* | 4261 | /* |
| 3198 | * How many blocks doth make a writepage()? | 4262 | * How many blocks doth make a writepage()? |
| @@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
| 3506 | 4570 | ||
| 3507 | return err; | 4571 | return err; |
| 3508 | } | 4572 | } |
| 4573 | |||
| 4574 | static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | ||
| 4575 | { | ||
| 4576 | return !buffer_mapped(bh); | ||
| 4577 | } | ||
| 4578 | |||
| 4579 | int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) | ||
| 4580 | { | ||
| 4581 | loff_t size; | ||
| 4582 | unsigned long len; | ||
| 4583 | int ret = -EINVAL; | ||
| 4584 | struct file *file = vma->vm_file; | ||
| 4585 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 4586 | struct address_space *mapping = inode->i_mapping; | ||
| 4587 | |||
| 4588 | /* | ||
| 4589 | * Get i_alloc_sem to stop truncates messing with the inode. We cannot | ||
| 4590 | * get i_mutex because we are already holding mmap_sem. | ||
| 4591 | */ | ||
| 4592 | down_read(&inode->i_alloc_sem); | ||
| 4593 | size = i_size_read(inode); | ||
| 4594 | if (page->mapping != mapping || size <= page_offset(page) | ||
| 4595 | || !PageUptodate(page)) { | ||
| 4596 | /* page got truncated from under us? */ | ||
| 4597 | goto out_unlock; | ||
| 4598 | } | ||
| 4599 | ret = 0; | ||
| 4600 | if (PageMappedToDisk(page)) | ||
| 4601 | goto out_unlock; | ||
| 4602 | |||
| 4603 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
| 4604 | len = size & ~PAGE_CACHE_MASK; | ||
| 4605 | else | ||
| 4606 | len = PAGE_CACHE_SIZE; | ||
| 4607 | |||
| 4608 | if (page_has_buffers(page)) { | ||
| 4609 | /* return if we have all the buffers mapped */ | ||
| 4610 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
| 4611 | ext4_bh_unmapped)) | ||
| 4612 | goto out_unlock; | ||
| 4613 | } | ||
| 4614 | /* | ||
| 4615 | * OK, we need to fill the hole... Do write_begin write_end | ||
| 4616 | * to do block allocation/reservation.We are not holding | ||
| 4617 | * inode.i__mutex here. That allow * parallel write_begin, | ||
| 4618 | * write_end call. lock_page prevent this from happening | ||
| 4619 | * on the same page though | ||
| 4620 | */ | ||
| 4621 | ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), | ||
| 4622 | len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); | ||
| 4623 | if (ret < 0) | ||
| 4624 | goto out_unlock; | ||
| 4625 | ret = mapping->a_ops->write_end(file, mapping, page_offset(page), | ||
| 4626 | len, len, page, NULL); | ||
| 4627 | if (ret < 0) | ||
| 4628 | goto out_unlock; | ||
| 4629 | ret = 0; | ||
| 4630 | out_unlock: | ||
| 4631 | up_read(&inode->i_alloc_sem); | ||
| 4632 | return ret; | ||
| 4633 | } | ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c9900aade150..8d141a25bbee 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
| @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) | |||
| 381 | 381 | ||
| 382 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) | 382 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) |
| 383 | { | 383 | { |
| 384 | int fix = 0; | 384 | int fix = 0, ret, tmpmax; |
| 385 | addr = mb_correct_addr_and_bit(&fix, addr); | 385 | addr = mb_correct_addr_and_bit(&fix, addr); |
| 386 | max += fix; | 386 | tmpmax = max + fix; |
| 387 | start += fix; | 387 | start += fix; |
| 388 | 388 | ||
| 389 | return ext4_find_next_zero_bit(addr, max, start) - fix; | 389 | ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; |
| 390 | if (ret > max) | ||
| 391 | return max; | ||
| 392 | return ret; | ||
| 390 | } | 393 | } |
| 391 | 394 | ||
| 392 | static inline int mb_find_next_bit(void *addr, int max, int start) | 395 | static inline int mb_find_next_bit(void *addr, int max, int start) |
| 393 | { | 396 | { |
| 394 | int fix = 0; | 397 | int fix = 0, ret, tmpmax; |
| 395 | addr = mb_correct_addr_and_bit(&fix, addr); | 398 | addr = mb_correct_addr_and_bit(&fix, addr); |
| 396 | max += fix; | 399 | tmpmax = max + fix; |
| 397 | start += fix; | 400 | start += fix; |
| 398 | 401 | ||
| 399 | return ext4_find_next_bit(addr, max, start) - fix; | 402 | ret = ext4_find_next_bit(addr, tmpmax, start) - fix; |
| 403 | if (ret > max) | ||
| 404 | return max; | ||
| 405 | return ret; | ||
| 400 | } | 406 | } |
| 401 | 407 | ||
| 402 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | 408 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) |
| @@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
| 803 | if (!buffer_uptodate(bh[i])) | 809 | if (!buffer_uptodate(bh[i])) |
| 804 | goto out; | 810 | goto out; |
| 805 | 811 | ||
| 812 | err = 0; | ||
| 806 | first_block = page->index * blocks_per_page; | 813 | first_block = page->index * blocks_per_page; |
| 807 | for (i = 0; i < blocks_per_page; i++) { | 814 | for (i = 0; i < blocks_per_page; i++) { |
| 808 | int group; | 815 | int group; |
| @@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
| 883 | int pnum; | 890 | int pnum; |
| 884 | int poff; | 891 | int poff; |
| 885 | struct page *page; | 892 | struct page *page; |
| 893 | int ret; | ||
| 886 | 894 | ||
| 887 | mb_debug("load group %lu\n", group); | 895 | mb_debug("load group %lu\n", group); |
| 888 | 896 | ||
| @@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
| 914 | if (page) { | 922 | if (page) { |
| 915 | BUG_ON(page->mapping != inode->i_mapping); | 923 | BUG_ON(page->mapping != inode->i_mapping); |
| 916 | if (!PageUptodate(page)) { | 924 | if (!PageUptodate(page)) { |
| 917 | ext4_mb_init_cache(page, NULL); | 925 | ret = ext4_mb_init_cache(page, NULL); |
| 926 | if (ret) { | ||
| 927 | unlock_page(page); | ||
| 928 | goto err; | ||
| 929 | } | ||
| 918 | mb_cmp_bitmaps(e4b, page_address(page) + | 930 | mb_cmp_bitmaps(e4b, page_address(page) + |
| 919 | (poff * sb->s_blocksize)); | 931 | (poff * sb->s_blocksize)); |
| 920 | } | 932 | } |
| 921 | unlock_page(page); | 933 | unlock_page(page); |
| 922 | } | 934 | } |
| 923 | } | 935 | } |
| 924 | if (page == NULL || !PageUptodate(page)) | 936 | if (page == NULL || !PageUptodate(page)) { |
| 937 | ret = -EIO; | ||
| 925 | goto err; | 938 | goto err; |
| 939 | } | ||
| 926 | e4b->bd_bitmap_page = page; | 940 | e4b->bd_bitmap_page = page; |
| 927 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); | 941 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); |
| 928 | mark_page_accessed(page); | 942 | mark_page_accessed(page); |
| @@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
| 938 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 952 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
| 939 | if (page) { | 953 | if (page) { |
| 940 | BUG_ON(page->mapping != inode->i_mapping); | 954 | BUG_ON(page->mapping != inode->i_mapping); |
| 941 | if (!PageUptodate(page)) | 955 | if (!PageUptodate(page)) { |
| 942 | ext4_mb_init_cache(page, e4b->bd_bitmap); | 956 | ret = ext4_mb_init_cache(page, e4b->bd_bitmap); |
| 943 | 957 | if (ret) { | |
| 958 | unlock_page(page); | ||
| 959 | goto err; | ||
| 960 | } | ||
| 961 | } | ||
| 944 | unlock_page(page); | 962 | unlock_page(page); |
| 945 | } | 963 | } |
| 946 | } | 964 | } |
| 947 | if (page == NULL || !PageUptodate(page)) | 965 | if (page == NULL || !PageUptodate(page)) { |
| 966 | ret = -EIO; | ||
| 948 | goto err; | 967 | goto err; |
| 968 | } | ||
| 949 | e4b->bd_buddy_page = page; | 969 | e4b->bd_buddy_page = page; |
| 950 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); | 970 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); |
| 951 | mark_page_accessed(page); | 971 | mark_page_accessed(page); |
| @@ -962,7 +982,7 @@ err: | |||
| 962 | page_cache_release(e4b->bd_buddy_page); | 982 | page_cache_release(e4b->bd_buddy_page); |
| 963 | e4b->bd_buddy = NULL; | 983 | e4b->bd_buddy = NULL; |
| 964 | e4b->bd_bitmap = NULL; | 984 | e4b->bd_bitmap = NULL; |
| 965 | return -EIO; | 985 | return ret; |
| 966 | } | 986 | } |
| 967 | 987 | ||
| 968 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) | 988 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) |
| @@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | |||
| 1031 | } | 1051 | } |
| 1032 | } | 1052 | } |
| 1033 | 1053 | ||
| 1034 | static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | 1054 | static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, |
| 1035 | int first, int count) | 1055 | int first, int count) |
| 1036 | { | 1056 | { |
| 1037 | int block = 0; | 1057 | int block = 0; |
| @@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
| 1071 | blocknr += block; | 1091 | blocknr += block; |
| 1072 | blocknr += | 1092 | blocknr += |
| 1073 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 1093 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
| 1074 | 1094 | ext4_unlock_group(sb, e4b->bd_group); | |
| 1075 | ext4_error(sb, __func__, "double-free of inode" | 1095 | ext4_error(sb, __func__, "double-free of inode" |
| 1076 | " %lu's block %llu(bit %u in group %lu)\n", | 1096 | " %lu's block %llu(bit %u in group %lu)\n", |
| 1077 | inode ? inode->i_ino : 0, blocknr, block, | 1097 | inode ? inode->i_ino : 0, blocknr, block, |
| 1078 | e4b->bd_group); | 1098 | e4b->bd_group); |
| 1099 | ext4_lock_group(sb, e4b->bd_group); | ||
| 1079 | } | 1100 | } |
| 1080 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | 1101 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); |
| 1081 | e4b->bd_info->bb_counters[order]++; | 1102 | e4b->bd_info->bb_counters[order]++; |
| @@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
| 1113 | } while (1); | 1134 | } while (1); |
| 1114 | } | 1135 | } |
| 1115 | mb_check_buddy(e4b); | 1136 | mb_check_buddy(e4b); |
| 1116 | |||
| 1117 | return 0; | ||
| 1118 | } | 1137 | } |
| 1119 | 1138 | ||
| 1120 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | 1139 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, |
| @@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
| 1730 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | 1749 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; |
| 1731 | spin_unlock(&sbi->s_md_lock); | 1750 | spin_unlock(&sbi->s_md_lock); |
| 1732 | } | 1751 | } |
| 1733 | |||
| 1734 | /* searching for the right group start from the goal value specified */ | ||
| 1735 | group = ac->ac_g_ex.fe_group; | ||
| 1736 | |||
| 1737 | /* Let's just scan groups to find more-less suitable blocks */ | 1752 | /* Let's just scan groups to find more-less suitable blocks */ |
| 1738 | cr = ac->ac_2order ? 0 : 1; | 1753 | cr = ac->ac_2order ? 0 : 1; |
| 1739 | /* | 1754 | /* |
| @@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
| 1743 | repeat: | 1758 | repeat: |
| 1744 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { | 1759 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { |
| 1745 | ac->ac_criteria = cr; | 1760 | ac->ac_criteria = cr; |
| 1761 | /* | ||
| 1762 | * searching for the right group start | ||
| 1763 | * from the goal value specified | ||
| 1764 | */ | ||
| 1765 | group = ac->ac_g_ex.fe_group; | ||
| 1766 | |||
| 1746 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { | 1767 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { |
| 1747 | struct ext4_group_info *grp; | 1768 | struct ext4_group_info *grp; |
| 1748 | struct ext4_group_desc *desc; | 1769 | struct ext4_group_desc *desc; |
| @@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) | |||
| 1963 | int rc; | 1984 | int rc; |
| 1964 | int size; | 1985 | int size; |
| 1965 | 1986 | ||
| 1987 | if (unlikely(sbi->s_mb_history == NULL)) | ||
| 1988 | return -ENOMEM; | ||
| 1966 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 1989 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
| 1967 | if (s == NULL) | 1990 | if (s == NULL) |
| 1968 | return -ENOMEM; | 1991 | return -ENOMEM; |
| @@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb) | |||
| 2165 | sbi->s_mb_history_cur = 0; | 2188 | sbi->s_mb_history_cur = 0; |
| 2166 | spin_lock_init(&sbi->s_mb_history_lock); | 2189 | spin_lock_init(&sbi->s_mb_history_lock); |
| 2167 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); | 2190 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); |
| 2168 | sbi->s_mb_history = kmalloc(i, GFP_KERNEL); | 2191 | sbi->s_mb_history = kzalloc(i, GFP_KERNEL); |
| 2169 | if (likely(sbi->s_mb_history != NULL)) | ||
| 2170 | memset(sbi->s_mb_history, 0, i); | ||
| 2171 | /* if we can't allocate history, then we simple won't use it */ | 2192 | /* if we can't allocate history, then we simple won't use it */ |
| 2172 | } | 2193 | } |
| 2173 | 2194 | ||
| @@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) | |||
| 2215 | #define ext4_mb_history_init(sb) | 2236 | #define ext4_mb_history_init(sb) |
| 2216 | #endif | 2237 | #endif |
| 2217 | 2238 | ||
| 2239 | |||
| 2240 | /* Create and initialize ext4_group_info data for the given group. */ | ||
| 2241 | int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | ||
| 2242 | struct ext4_group_desc *desc) | ||
| 2243 | { | ||
| 2244 | int i, len; | ||
| 2245 | int metalen = 0; | ||
| 2246 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 2247 | struct ext4_group_info **meta_group_info; | ||
| 2248 | |||
| 2249 | /* | ||
| 2250 | * First check if this group is the first of a reserved block. | ||
| 2251 | * If it's true, we have to allocate a new table of pointers | ||
| 2252 | * to ext4_group_info structures | ||
| 2253 | */ | ||
| 2254 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { | ||
| 2255 | metalen = sizeof(*meta_group_info) << | ||
| 2256 | EXT4_DESC_PER_BLOCK_BITS(sb); | ||
| 2257 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | ||
| 2258 | if (meta_group_info == NULL) { | ||
| 2259 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | ||
| 2260 | "buddy group\n"); | ||
| 2261 | goto exit_meta_group_info; | ||
| 2262 | } | ||
| 2263 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = | ||
| 2264 | meta_group_info; | ||
| 2265 | } | ||
| 2266 | |||
| 2267 | /* | ||
| 2268 | * calculate needed size. if change bb_counters size, | ||
| 2269 | * don't forget about ext4_mb_generate_buddy() | ||
| 2270 | */ | ||
| 2271 | len = offsetof(typeof(**meta_group_info), | ||
| 2272 | bb_counters[sb->s_blocksize_bits + 2]); | ||
| 2273 | |||
| 2274 | meta_group_info = | ||
| 2275 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | ||
| 2276 | i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); | ||
| 2277 | |||
| 2278 | meta_group_info[i] = kzalloc(len, GFP_KERNEL); | ||
| 2279 | if (meta_group_info[i] == NULL) { | ||
| 2280 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | ||
| 2281 | goto exit_group_info; | ||
| 2282 | } | ||
| 2283 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | ||
| 2284 | &(meta_group_info[i]->bb_state)); | ||
| 2285 | |||
| 2286 | /* | ||
| 2287 | * initialize bb_free to be able to skip | ||
| 2288 | * empty groups without initialization | ||
| 2289 | */ | ||
| 2290 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
| 2291 | meta_group_info[i]->bb_free = | ||
| 2292 | ext4_free_blocks_after_init(sb, group, desc); | ||
| 2293 | } else { | ||
| 2294 | meta_group_info[i]->bb_free = | ||
| 2295 | le16_to_cpu(desc->bg_free_blocks_count); | ||
| 2296 | } | ||
| 2297 | |||
| 2298 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | ||
| 2299 | |||
| 2300 | #ifdef DOUBLE_CHECK | ||
| 2301 | { | ||
| 2302 | struct buffer_head *bh; | ||
| 2303 | meta_group_info[i]->bb_bitmap = | ||
| 2304 | kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
| 2305 | BUG_ON(meta_group_info[i]->bb_bitmap == NULL); | ||
| 2306 | bh = ext4_read_block_bitmap(sb, group); | ||
| 2307 | BUG_ON(bh == NULL); | ||
| 2308 | memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, | ||
| 2309 | sb->s_blocksize); | ||
| 2310 | put_bh(bh); | ||
| 2311 | } | ||
| 2312 | #endif | ||
| 2313 | |||
| 2314 | return 0; | ||
| 2315 | |||
| 2316 | exit_group_info: | ||
| 2317 | /* If a meta_group_info table has been allocated, release it now */ | ||
| 2318 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) | ||
| 2319 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); | ||
| 2320 | exit_meta_group_info: | ||
| 2321 | return -ENOMEM; | ||
| 2322 | } /* ext4_mb_add_groupinfo */ | ||
| 2323 | |||
| 2324 | /* | ||
| 2325 | * Add a group to the existing groups. | ||
| 2326 | * This function is used for online resize | ||
| 2327 | */ | ||
| 2328 | int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, | ||
| 2329 | struct ext4_group_desc *desc) | ||
| 2330 | { | ||
| 2331 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 2332 | struct inode *inode = sbi->s_buddy_cache; | ||
| 2333 | int blocks_per_page; | ||
| 2334 | int block; | ||
| 2335 | int pnum; | ||
| 2336 | struct page *page; | ||
| 2337 | int err; | ||
| 2338 | |||
| 2339 | /* Add group based on group descriptor*/ | ||
| 2340 | err = ext4_mb_add_groupinfo(sb, group, desc); | ||
| 2341 | if (err) | ||
| 2342 | return err; | ||
| 2343 | |||
| 2344 | /* | ||
| 2345 | * Cache pages containing dynamic mb_alloc datas (buddy and bitmap | ||
| 2346 | * datas) are set not up to date so that they will be re-initilaized | ||
| 2347 | * during the next call to ext4_mb_load_buddy | ||
| 2348 | */ | ||
| 2349 | |||
| 2350 | /* Set buddy page as not up to date */ | ||
| 2351 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
| 2352 | block = group * 2; | ||
| 2353 | pnum = block / blocks_per_page; | ||
| 2354 | page = find_get_page(inode->i_mapping, pnum); | ||
| 2355 | if (page != NULL) { | ||
| 2356 | ClearPageUptodate(page); | ||
| 2357 | page_cache_release(page); | ||
| 2358 | } | ||
| 2359 | |||
| 2360 | /* Set bitmap page as not up to date */ | ||
| 2361 | block++; | ||
| 2362 | pnum = block / blocks_per_page; | ||
| 2363 | page = find_get_page(inode->i_mapping, pnum); | ||
| 2364 | if (page != NULL) { | ||
| 2365 | ClearPageUptodate(page); | ||
| 2366 | page_cache_release(page); | ||
| 2367 | } | ||
| 2368 | |||
| 2369 | return 0; | ||
| 2370 | } | ||
| 2371 | |||
| 2372 | /* | ||
| 2373 | * Update an existing group. | ||
| 2374 | * This function is used for online resize | ||
| 2375 | */ | ||
| 2376 | void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) | ||
| 2377 | { | ||
| 2378 | grp->bb_free += add; | ||
| 2379 | } | ||
| 2380 | |||
| 2218 | static int ext4_mb_init_backend(struct super_block *sb) | 2381 | static int ext4_mb_init_backend(struct super_block *sb) |
| 2219 | { | 2382 | { |
| 2220 | ext4_group_t i; | 2383 | ext4_group_t i; |
| 2221 | int j, len, metalen; | 2384 | int metalen; |
| 2222 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2385 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 2223 | int num_meta_group_infos = | 2386 | struct ext4_super_block *es = sbi->s_es; |
| 2224 | (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> | 2387 | int num_meta_group_infos; |
| 2225 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2388 | int num_meta_group_infos_max; |
| 2389 | int array_size; | ||
| 2226 | struct ext4_group_info **meta_group_info; | 2390 | struct ext4_group_info **meta_group_info; |
| 2391 | struct ext4_group_desc *desc; | ||
| 2392 | |||
| 2393 | /* This is the number of blocks used by GDT */ | ||
| 2394 | num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - | ||
| 2395 | 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); | ||
| 2396 | |||
| 2397 | /* | ||
| 2398 | * This is the total number of blocks used by GDT including | ||
| 2399 | * the number of reserved blocks for GDT. | ||
| 2400 | * The s_group_info array is allocated with this value | ||
| 2401 | * to allow a clean online resize without a complex | ||
| 2402 | * manipulation of pointer. | ||
| 2403 | * The drawback is the unused memory when no resize | ||
| 2404 | * occurs but it's very low in terms of pages | ||
| 2405 | * (see comments below) | ||
| 2406 | * Need to handle this properly when META_BG resizing is allowed | ||
| 2407 | */ | ||
| 2408 | num_meta_group_infos_max = num_meta_group_infos + | ||
| 2409 | le16_to_cpu(es->s_reserved_gdt_blocks); | ||
| 2227 | 2410 | ||
| 2411 | /* | ||
| 2412 | * array_size is the size of s_group_info array. We round it | ||
| 2413 | * to the next power of two because this approximation is done | ||
| 2414 | * internally by kmalloc so we can have some more memory | ||
| 2415 | * for free here (e.g. may be used for META_BG resize). | ||
| 2416 | */ | ||
| 2417 | array_size = 1; | ||
| 2418 | while (array_size < sizeof(*sbi->s_group_info) * | ||
| 2419 | num_meta_group_infos_max) | ||
| 2420 | array_size = array_size << 1; | ||
| 2228 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | 2421 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte |
| 2229 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | 2422 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. |
| 2230 | * So a two level scheme suffices for now. */ | 2423 | * So a two level scheme suffices for now. */ |
| 2231 | sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * | 2424 | sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); |
| 2232 | num_meta_group_infos, GFP_KERNEL); | ||
| 2233 | if (sbi->s_group_info == NULL) { | 2425 | if (sbi->s_group_info == NULL) { |
| 2234 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | 2426 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); |
| 2235 | return -ENOMEM; | 2427 | return -ENOMEM; |
| @@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
| 2256 | sbi->s_group_info[i] = meta_group_info; | 2448 | sbi->s_group_info[i] = meta_group_info; |
| 2257 | } | 2449 | } |
| 2258 | 2450 | ||
| 2259 | /* | ||
| 2260 | * calculate needed size. if change bb_counters size, | ||
| 2261 | * don't forget about ext4_mb_generate_buddy() | ||
| 2262 | */ | ||
| 2263 | len = sizeof(struct ext4_group_info); | ||
| 2264 | len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); | ||
| 2265 | for (i = 0; i < sbi->s_groups_count; i++) { | 2451 | for (i = 0; i < sbi->s_groups_count; i++) { |
| 2266 | struct ext4_group_desc *desc; | ||
| 2267 | |||
| 2268 | meta_group_info = | ||
| 2269 | sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | ||
| 2270 | j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); | ||
| 2271 | |||
| 2272 | meta_group_info[j] = kzalloc(len, GFP_KERNEL); | ||
| 2273 | if (meta_group_info[j] == NULL) { | ||
| 2274 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | ||
| 2275 | goto err_freebuddy; | ||
| 2276 | } | ||
| 2277 | desc = ext4_get_group_desc(sb, i, NULL); | 2452 | desc = ext4_get_group_desc(sb, i, NULL); |
| 2278 | if (desc == NULL) { | 2453 | if (desc == NULL) { |
| 2279 | printk(KERN_ERR | 2454 | printk(KERN_ERR |
| 2280 | "EXT4-fs: can't read descriptor %lu\n", i); | 2455 | "EXT4-fs: can't read descriptor %lu\n", i); |
| 2281 | i++; | ||
| 2282 | goto err_freebuddy; | 2456 | goto err_freebuddy; |
| 2283 | } | 2457 | } |
| 2284 | memset(meta_group_info[j], 0, len); | 2458 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
| 2285 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | 2459 | goto err_freebuddy; |
| 2286 | &(meta_group_info[j]->bb_state)); | ||
| 2287 | |||
| 2288 | /* | ||
| 2289 | * initialize bb_free to be able to skip | ||
| 2290 | * empty groups without initialization | ||
| 2291 | */ | ||
| 2292 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
| 2293 | meta_group_info[j]->bb_free = | ||
| 2294 | ext4_free_blocks_after_init(sb, i, desc); | ||
| 2295 | } else { | ||
| 2296 | meta_group_info[j]->bb_free = | ||
| 2297 | le16_to_cpu(desc->bg_free_blocks_count); | ||
| 2298 | } | ||
| 2299 | |||
| 2300 | INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); | ||
| 2301 | |||
| 2302 | #ifdef DOUBLE_CHECK | ||
| 2303 | { | ||
| 2304 | struct buffer_head *bh; | ||
| 2305 | meta_group_info[j]->bb_bitmap = | ||
| 2306 | kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
| 2307 | BUG_ON(meta_group_info[j]->bb_bitmap == NULL); | ||
| 2308 | bh = read_block_bitmap(sb, i); | ||
| 2309 | BUG_ON(bh == NULL); | ||
| 2310 | memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, | ||
| 2311 | sb->s_blocksize); | ||
| 2312 | put_bh(bh); | ||
| 2313 | } | ||
| 2314 | #endif | ||
| 2315 | |||
| 2316 | } | 2460 | } |
| 2317 | 2461 | ||
| 2318 | return 0; | 2462 | return 0; |
| @@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
| 2336 | unsigned i; | 2480 | unsigned i; |
| 2337 | unsigned offset; | 2481 | unsigned offset; |
| 2338 | unsigned max; | 2482 | unsigned max; |
| 2483 | int ret; | ||
| 2339 | 2484 | ||
| 2340 | if (!test_opt(sb, MBALLOC)) | 2485 | if (!test_opt(sb, MBALLOC)) |
| 2341 | return 0; | 2486 | return 0; |
| @@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
| 2370 | } while (i <= sb->s_blocksize_bits + 1); | 2515 | } while (i <= sb->s_blocksize_bits + 1); |
| 2371 | 2516 | ||
| 2372 | /* init file for buddy data */ | 2517 | /* init file for buddy data */ |
| 2373 | i = ext4_mb_init_backend(sb); | 2518 | ret = ext4_mb_init_backend(sb); |
| 2374 | if (i) { | 2519 | if (ret != 0) { |
| 2375 | clear_opt(sbi->s_mount_opt, MBALLOC); | 2520 | clear_opt(sbi->s_mount_opt, MBALLOC); |
| 2376 | kfree(sbi->s_mb_offsets); | 2521 | kfree(sbi->s_mb_offsets); |
| 2377 | kfree(sbi->s_mb_maxs); | 2522 | kfree(sbi->s_mb_maxs); |
| 2378 | return i; | 2523 | return ret; |
| 2379 | } | 2524 | } |
| 2380 | 2525 | ||
| 2381 | spin_lock_init(&sbi->s_md_lock); | 2526 | spin_lock_init(&sbi->s_md_lock); |
| @@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |||
| 2548 | ext4_lock_group(sb, md->group); | 2693 | ext4_lock_group(sb, md->group); |
| 2549 | for (i = 0; i < md->num; i++) { | 2694 | for (i = 0; i < md->num; i++) { |
| 2550 | mb_debug(" %u", md->blocks[i]); | 2695 | mb_debug(" %u", md->blocks[i]); |
| 2551 | err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | 2696 | mb_free_blocks(NULL, &e4b, md->blocks[i], 1); |
| 2552 | BUG_ON(err != 0); | ||
| 2553 | } | 2697 | } |
| 2554 | mb_debug("\n"); | 2698 | mb_debug("\n"); |
| 2555 | ext4_unlock_group(sb, md->group); | 2699 | ext4_unlock_group(sb, md->group); |
| @@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |||
| 2575 | 2719 | ||
| 2576 | 2720 | ||
| 2577 | 2721 | ||
| 2578 | #define MB_PROC_VALUE_READ(name) \ | 2722 | #define MB_PROC_FOPS(name) \ |
| 2579 | static int ext4_mb_read_##name(char *page, char **start, \ | 2723 | static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ |
| 2580 | off_t off, int count, int *eof, void *data) \ | ||
| 2581 | { \ | 2724 | { \ |
| 2582 | struct ext4_sb_info *sbi = data; \ | 2725 | struct ext4_sb_info *sbi = m->private; \ |
| 2583 | int len; \ | 2726 | \ |
| 2584 | *eof = 1; \ | 2727 | seq_printf(m, "%ld\n", sbi->s_mb_##name); \ |
| 2585 | if (off != 0) \ | 2728 | return 0; \ |
| 2586 | return 0; \ | 2729 | } \ |
| 2587 | len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ | 2730 | \ |
| 2588 | *start = page; \ | 2731 | static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ |
| 2589 | return len; \ | 2732 | { \ |
| 2590 | } | 2733 | return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ |
| 2591 | 2734 | } \ | |
| 2592 | #define MB_PROC_VALUE_WRITE(name) \ | 2735 | \ |
| 2593 | static int ext4_mb_write_##name(struct file *file, \ | 2736 | static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ |
| 2594 | const char __user *buf, unsigned long cnt, void *data) \ | 2737 | const char __user *buf, size_t cnt, loff_t *ppos) \ |
| 2595 | { \ | 2738 | { \ |
| 2596 | struct ext4_sb_info *sbi = data; \ | 2739 | struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ |
| 2597 | char str[32]; \ | 2740 | char str[32]; \ |
| 2598 | long value; \ | 2741 | long value; \ |
| 2599 | if (cnt >= sizeof(str)) \ | 2742 | if (cnt >= sizeof(str)) \ |
| @@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \ | |||
| 2605 | return -ERANGE; \ | 2748 | return -ERANGE; \ |
| 2606 | sbi->s_mb_##name = value; \ | 2749 | sbi->s_mb_##name = value; \ |
| 2607 | return cnt; \ | 2750 | return cnt; \ |
| 2608 | } | 2751 | } \ |
| 2752 | \ | ||
| 2753 | static const struct file_operations ext4_mb_##name##_proc_fops = { \ | ||
| 2754 | .owner = THIS_MODULE, \ | ||
| 2755 | .open = ext4_mb_##name##_proc_open, \ | ||
| 2756 | .read = seq_read, \ | ||
| 2757 | .llseek = seq_lseek, \ | ||
| 2758 | .release = single_release, \ | ||
| 2759 | .write = ext4_mb_##name##_proc_write, \ | ||
| 2760 | }; | ||
| 2609 | 2761 | ||
| 2610 | MB_PROC_VALUE_READ(stats); | 2762 | MB_PROC_FOPS(stats); |
| 2611 | MB_PROC_VALUE_WRITE(stats); | 2763 | MB_PROC_FOPS(max_to_scan); |
| 2612 | MB_PROC_VALUE_READ(max_to_scan); | 2764 | MB_PROC_FOPS(min_to_scan); |
| 2613 | MB_PROC_VALUE_WRITE(max_to_scan); | 2765 | MB_PROC_FOPS(order2_reqs); |
| 2614 | MB_PROC_VALUE_READ(min_to_scan); | 2766 | MB_PROC_FOPS(stream_request); |
| 2615 | MB_PROC_VALUE_WRITE(min_to_scan); | 2767 | MB_PROC_FOPS(group_prealloc); |
| 2616 | MB_PROC_VALUE_READ(order2_reqs); | ||
| 2617 | MB_PROC_VALUE_WRITE(order2_reqs); | ||
| 2618 | MB_PROC_VALUE_READ(stream_request); | ||
| 2619 | MB_PROC_VALUE_WRITE(stream_request); | ||
| 2620 | MB_PROC_VALUE_READ(group_prealloc); | ||
| 2621 | MB_PROC_VALUE_WRITE(group_prealloc); | ||
| 2622 | 2768 | ||
| 2623 | #define MB_PROC_HANDLER(name, var) \ | 2769 | #define MB_PROC_HANDLER(name, var) \ |
| 2624 | do { \ | 2770 | do { \ |
| 2625 | proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ | 2771 | proc = proc_create_data(name, mode, sbi->s_mb_proc, \ |
| 2772 | &ext4_mb_##var##_proc_fops, sbi); \ | ||
| 2626 | if (proc == NULL) { \ | 2773 | if (proc == NULL) { \ |
| 2627 | printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ | 2774 | printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ |
| 2628 | goto err_out; \ | 2775 | goto err_out; \ |
| 2629 | } \ | 2776 | } \ |
| 2630 | proc->data = sbi; \ | ||
| 2631 | proc->read_proc = ext4_mb_read_##var ; \ | ||
| 2632 | proc->write_proc = ext4_mb_write_##var; \ | ||
| 2633 | } while (0) | 2777 | } while (0) |
| 2634 | 2778 | ||
| 2635 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) | 2779 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) |
| @@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb) | |||
| 2639 | struct proc_dir_entry *proc; | 2783 | struct proc_dir_entry *proc; |
| 2640 | char devname[64]; | 2784 | char devname[64]; |
| 2641 | 2785 | ||
| 2786 | if (proc_root_ext4 == NULL) { | ||
| 2787 | sbi->s_mb_proc = NULL; | ||
| 2788 | return -EINVAL; | ||
| 2789 | } | ||
| 2642 | bdevname(sb->s_bdev, devname); | 2790 | bdevname(sb->s_bdev, devname); |
| 2643 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); | 2791 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); |
| 2644 | 2792 | ||
| @@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
| 2747 | 2895 | ||
| 2748 | 2896 | ||
| 2749 | err = -EIO; | 2897 | err = -EIO; |
| 2750 | bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); | 2898 | bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); |
| 2751 | if (!bitmap_bh) | 2899 | if (!bitmap_bh) |
| 2752 | goto out_err; | 2900 | goto out_err; |
| 2753 | 2901 | ||
| @@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
| 2816 | le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); | 2964 | le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); |
| 2817 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | 2965 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); |
| 2818 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | 2966 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); |
| 2819 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | 2967 | |
| 2968 | /* | ||
| 2969 | * free blocks account has already be reduced/reserved | ||
| 2970 | * at write_begin() time for delayed allocation | ||
| 2971 | * do not double accounting | ||
| 2972 | */ | ||
| 2973 | if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) | ||
| 2974 | percpu_counter_sub(&sbi->s_freeblocks_counter, | ||
| 2975 | ac->ac_b_ex.fe_len); | ||
| 2976 | |||
| 2977 | if (sbi->s_log_groups_per_flex) { | ||
| 2978 | ext4_group_t flex_group = ext4_flex_group(sbi, | ||
| 2979 | ac->ac_b_ex.fe_group); | ||
| 2980 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
| 2981 | sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; | ||
| 2982 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
| 2983 | } | ||
| 2820 | 2984 | ||
| 2821 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 2985 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); |
| 2822 | if (err) | 2986 | if (err) |
| @@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
| 3473 | if (bit >= end) | 3637 | if (bit >= end) |
| 3474 | break; | 3638 | break; |
| 3475 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); | 3639 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); |
| 3476 | if (next > end) | ||
| 3477 | next = end; | ||
| 3478 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + | 3640 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + |
| 3479 | le32_to_cpu(sbi->s_es->s_first_data_block); | 3641 | le32_to_cpu(sbi->s_es->s_first_data_block); |
| 3480 | mb_debug(" free preallocated %u/%u in group %u\n", | 3642 | mb_debug(" free preallocated %u/%u in group %u\n", |
| @@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |||
| 3569 | if (list_empty(&grp->bb_prealloc_list)) | 3731 | if (list_empty(&grp->bb_prealloc_list)) |
| 3570 | return 0; | 3732 | return 0; |
| 3571 | 3733 | ||
| 3572 | bitmap_bh = read_block_bitmap(sb, group); | 3734 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
| 3573 | if (bitmap_bh == NULL) { | 3735 | if (bitmap_bh == NULL) { |
| 3574 | /* error handling here */ | 3736 | /* error handling here */ |
| 3575 | ext4_mb_release_desc(&e4b); | 3737 | ext4_mb_release_desc(&e4b); |
| @@ -3743,7 +3905,7 @@ repeat: | |||
| 3743 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3905 | err = ext4_mb_load_buddy(sb, group, &e4b); |
| 3744 | BUG_ON(err != 0); /* error handling here */ | 3906 | BUG_ON(err != 0); /* error handling here */ |
| 3745 | 3907 | ||
| 3746 | bitmap_bh = read_block_bitmap(sb, group); | 3908 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
| 3747 | if (bitmap_bh == NULL) { | 3909 | if (bitmap_bh == NULL) { |
| 3748 | /* error handling here */ | 3910 | /* error handling here */ |
| 3749 | ext4_mb_release_desc(&e4b); | 3911 | ext4_mb_release_desc(&e4b); |
| @@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
| 4011 | sbi = EXT4_SB(sb); | 4173 | sbi = EXT4_SB(sb); |
| 4012 | 4174 | ||
| 4013 | if (!test_opt(sb, MBALLOC)) { | 4175 | if (!test_opt(sb, MBALLOC)) { |
| 4014 | block = ext4_new_blocks_old(handle, ar->inode, ar->goal, | 4176 | block = ext4_old_new_blocks(handle, ar->inode, ar->goal, |
| 4015 | &(ar->len), errp); | 4177 | &(ar->len), errp); |
| 4016 | return block; | 4178 | return block; |
| 4017 | } | 4179 | } |
| 4180 | if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { | ||
| 4181 | /* | ||
| 4182 | * With delalloc we already reserved the blocks | ||
| 4183 | */ | ||
| 4184 | ar->len = ext4_has_free_blocks(sbi, ar->len); | ||
| 4185 | } | ||
| 4186 | |||
| 4187 | if (ar->len == 0) { | ||
| 4188 | *errp = -ENOSPC; | ||
| 4189 | return 0; | ||
| 4190 | } | ||
| 4018 | 4191 | ||
| 4019 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { | 4192 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { |
| 4020 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; | 4193 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; |
| @@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
| 4026 | } | 4199 | } |
| 4027 | inquota = ar->len; | 4200 | inquota = ar->len; |
| 4028 | 4201 | ||
| 4202 | if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) | ||
| 4203 | ar->flags |= EXT4_MB_DELALLOC_RESERVED; | ||
| 4204 | |||
| 4029 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | 4205 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); |
| 4030 | if (!ac) { | 4206 | if (!ac) { |
| 4207 | ar->len = 0; | ||
| 4031 | *errp = -ENOMEM; | 4208 | *errp = -ENOMEM; |
| 4032 | return 0; | 4209 | goto out1; |
| 4033 | } | 4210 | } |
| 4034 | 4211 | ||
| 4035 | ext4_mb_poll_new_transaction(sb, handle); | 4212 | ext4_mb_poll_new_transaction(sb, handle); |
| @@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
| 4037 | *errp = ext4_mb_initialize_context(ac, ar); | 4214 | *errp = ext4_mb_initialize_context(ac, ar); |
| 4038 | if (*errp) { | 4215 | if (*errp) { |
| 4039 | ar->len = 0; | 4216 | ar->len = 0; |
| 4040 | goto out; | 4217 | goto out2; |
| 4041 | } | 4218 | } |
| 4042 | 4219 | ||
| 4043 | ac->ac_op = EXT4_MB_HISTORY_PREALLOC; | 4220 | ac->ac_op = EXT4_MB_HISTORY_PREALLOC; |
| 4044 | if (!ext4_mb_use_preallocated(ac)) { | 4221 | if (!ext4_mb_use_preallocated(ac)) { |
| 4045 | |||
| 4046 | ac->ac_op = EXT4_MB_HISTORY_ALLOC; | 4222 | ac->ac_op = EXT4_MB_HISTORY_ALLOC; |
| 4047 | ext4_mb_normalize_request(ac, ar); | 4223 | ext4_mb_normalize_request(ac, ar); |
| 4048 | repeat: | 4224 | repeat: |
| @@ -4085,11 +4261,12 @@ repeat: | |||
| 4085 | 4261 | ||
| 4086 | ext4_mb_release_context(ac); | 4262 | ext4_mb_release_context(ac); |
| 4087 | 4263 | ||
| 4088 | out: | 4264 | out2: |
| 4265 | kmem_cache_free(ext4_ac_cachep, ac); | ||
| 4266 | out1: | ||
| 4089 | if (ar->len < inquota) | 4267 | if (ar->len < inquota) |
| 4090 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); | 4268 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); |
| 4091 | 4269 | ||
| 4092 | kmem_cache_free(ext4_ac_cachep, ac); | ||
| 4093 | return block; | 4270 | return block; |
| 4094 | } | 4271 | } |
| 4095 | static void ext4_mb_poll_new_transaction(struct super_block *sb, | 4272 | static void ext4_mb_poll_new_transaction(struct super_block *sb, |
| @@ -4242,7 +4419,7 @@ do_more: | |||
| 4242 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); | 4419 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); |
| 4243 | count -= overflow; | 4420 | count -= overflow; |
| 4244 | } | 4421 | } |
| 4245 | bitmap_bh = read_block_bitmap(sb, block_group); | 4422 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
| 4246 | if (!bitmap_bh) | 4423 | if (!bitmap_bh) |
| 4247 | goto error_return; | 4424 | goto error_return; |
| 4248 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); | 4425 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); |
| @@ -4309,10 +4486,9 @@ do_more: | |||
| 4309 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | 4486 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); |
| 4310 | } else { | 4487 | } else { |
| 4311 | ext4_lock_group(sb, block_group); | 4488 | ext4_lock_group(sb, block_group); |
| 4312 | err = mb_free_blocks(inode, &e4b, bit, count); | 4489 | mb_free_blocks(inode, &e4b, bit, count); |
| 4313 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | 4490 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); |
| 4314 | ext4_unlock_group(sb, block_group); | 4491 | ext4_unlock_group(sb, block_group); |
| 4315 | BUG_ON(err != 0); | ||
| 4316 | } | 4492 | } |
| 4317 | 4493 | ||
| 4318 | spin_lock(sb_bgl_lock(sbi, block_group)); | 4494 | spin_lock(sb_bgl_lock(sbi, block_group)); |
| @@ -4321,6 +4497,13 @@ do_more: | |||
| 4321 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 4497 | spin_unlock(sb_bgl_lock(sbi, block_group)); |
| 4322 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | 4498 | percpu_counter_add(&sbi->s_freeblocks_counter, count); |
| 4323 | 4499 | ||
| 4500 | if (sbi->s_log_groups_per_flex) { | ||
| 4501 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | ||
| 4502 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
| 4503 | sbi->s_flex_groups[flex_group].free_blocks += count; | ||
| 4504 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
| 4505 | } | ||
| 4506 | |||
| 4324 | ext4_mb_release_desc(&e4b); | 4507 | ext4_mb_release_desc(&e4b); |
| 4325 | 4508 | ||
| 4326 | *freed += count; | 4509 | *freed += count; |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ab16beaa830d..387ad98350c3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
| 183 | struct inode *inode); | 183 | struct inode *inode); |
| 184 | 184 | ||
| 185 | /* | 185 | /* |
| 186 | * p is at least 6 bytes before the end of page | ||
| 187 | */ | ||
| 188 | static inline struct ext4_dir_entry_2 * | ||
| 189 | ext4_next_entry(struct ext4_dir_entry_2 *p) | ||
| 190 | { | ||
| 191 | return (struct ext4_dir_entry_2 *)((char *)p + | ||
| 192 | ext4_rec_len_from_disk(p->rec_len)); | ||
| 193 | } | ||
| 194 | |||
| 195 | /* | ||
| 186 | * Future: use high four bits of block for coalesce-on-delete flags | 196 | * Future: use high four bits of block for coalesce-on-delete flags |
| 187 | * Mask them off for now. | 197 | * Mask them off for now. |
| 188 | */ | 198 | */ |
| @@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) | |||
| 231 | { | 241 | { |
| 232 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - | 242 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - |
| 233 | EXT4_DIR_REC_LEN(2) - infosize; | 243 | EXT4_DIR_REC_LEN(2) - infosize; |
| 234 | return 0? 20: entry_space / sizeof(struct dx_entry); | 244 | return entry_space / sizeof(struct dx_entry); |
| 235 | } | 245 | } |
| 236 | 246 | ||
| 237 | static inline unsigned dx_node_limit (struct inode *dir) | 247 | static inline unsigned dx_node_limit (struct inode *dir) |
| 238 | { | 248 | { |
| 239 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); | 249 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); |
| 240 | return 0? 22: entry_space / sizeof(struct dx_entry); | 250 | return entry_space / sizeof(struct dx_entry); |
| 241 | } | 251 | } |
| 242 | 252 | ||
| 243 | /* | 253 | /* |
| @@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, | |||
| 554 | 564 | ||
| 555 | 565 | ||
| 556 | /* | 566 | /* |
| 557 | * p is at least 6 bytes before the end of page | ||
| 558 | */ | ||
| 559 | static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) | ||
| 560 | { | ||
| 561 | return (struct ext4_dir_entry_2 *)((char *)p + | ||
| 562 | ext4_rec_len_from_disk(p->rec_len)); | ||
| 563 | } | ||
| 564 | |||
| 565 | /* | ||
| 566 | * This function fills a red-black tree with information from a | 567 | * This function fills a red-black tree with information from a |
| 567 | * directory block. It returns the number directory entries loaded | 568 | * directory block. It returns the number directory entries loaded |
| 568 | * into the tree. If there is an error it is returned in err. | 569 | * into the tree. If there is an error it is returned in err. |
| @@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, | |||
| 993 | de = (struct ext4_dir_entry_2 *) bh->b_data; | 994 | de = (struct ext4_dir_entry_2 *) bh->b_data; |
| 994 | top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - | 995 | top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - |
| 995 | EXT4_DIR_REC_LEN(0)); | 996 | EXT4_DIR_REC_LEN(0)); |
| 996 | for (; de < top; de = ext4_next_entry(de)) | 997 | for (; de < top; de = ext4_next_entry(de)) { |
| 997 | if (ext4_match (namelen, name, de)) { | 998 | int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) |
| 998 | if (!ext4_check_dir_entry("ext4_find_entry", | 999 | + ((char *) de - bh->b_data); |
| 999 | dir, de, bh, | 1000 | |
| 1000 | (block<<EXT4_BLOCK_SIZE_BITS(sb)) | 1001 | if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { |
| 1001 | +((char *)de - bh->b_data))) { | 1002 | brelse(bh); |
| 1002 | brelse (bh); | ||
| 1003 | *err = ERR_BAD_DX_DIR; | 1003 | *err = ERR_BAD_DX_DIR; |
| 1004 | goto errout; | 1004 | goto errout; |
| 1005 | } | 1005 | } |
| 1006 | *res_dir = de; | 1006 | |
| 1007 | dx_release (frames); | 1007 | if (ext4_match(namelen, name, de)) { |
| 1008 | return bh; | 1008 | *res_dir = de; |
| 1009 | dx_release(frames); | ||
| 1010 | return bh; | ||
| 1011 | } | ||
| 1009 | } | 1012 | } |
| 1010 | brelse (bh); | 1013 | brelse (bh); |
| 1011 | /* Check to see if we should continue to search */ | 1014 | /* Check to see if we should continue to search */ |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9ecb92f68543..f000fbe2cd93 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
| @@ -855,7 +855,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 855 | */ | 855 | */ |
| 856 | 856 | ||
| 857 | /* Update group descriptor block for new group */ | 857 | /* Update group descriptor block for new group */ |
| 858 | gdp = (struct ext4_group_desc *)primary->b_data + gdb_off; | 858 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + |
| 859 | gdb_off * EXT4_DESC_SIZE(sb)); | ||
| 859 | 860 | ||
| 860 | ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ | 861 | ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ |
| 861 | ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ | 862 | ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ |
| @@ -865,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 865 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); | 866 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); |
| 866 | 867 | ||
| 867 | /* | 868 | /* |
| 869 | * We can allocate memory for mb_alloc based on the new group | ||
| 870 | * descriptor | ||
| 871 | */ | ||
| 872 | if (test_opt(sb, MBALLOC)) { | ||
| 873 | err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); | ||
| 874 | if (err) | ||
| 875 | goto exit_journal; | ||
| 876 | } | ||
| 877 | /* | ||
| 868 | * Make the new blocks and inodes valid next. We do this before | 878 | * Make the new blocks and inodes valid next. We do this before |
| 869 | * increasing the group count so that once the group is enabled, | 879 | * increasing the group count so that once the group is enabled, |
| 870 | * all of its blocks and inodes are already valid. | 880 | * all of its blocks and inodes are already valid. |
| @@ -956,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 956 | handle_t *handle; | 966 | handle_t *handle; |
| 957 | int err; | 967 | int err; |
| 958 | unsigned long freed_blocks; | 968 | unsigned long freed_blocks; |
| 969 | ext4_group_t group; | ||
| 970 | struct ext4_group_info *grp; | ||
| 959 | 971 | ||
| 960 | /* We don't need to worry about locking wrt other resizers just | 972 | /* We don't need to worry about locking wrt other resizers just |
| 961 | * yet: we're going to revalidate es->s_blocks_count after | 973 | * yet: we're going to revalidate es->s_blocks_count after |
| @@ -987,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 987 | } | 999 | } |
| 988 | 1000 | ||
| 989 | /* Handle the remaining blocks in the last group only. */ | 1001 | /* Handle the remaining blocks in the last group only. */ |
| 990 | ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); | 1002 | ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); |
| 991 | 1003 | ||
| 992 | if (last == 0) { | 1004 | if (last == 0) { |
| 993 | ext4_warning(sb, __func__, | 1005 | ext4_warning(sb, __func__, |
| @@ -1059,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 1059 | o_blocks_count + add); | 1071 | o_blocks_count + add); |
| 1060 | if ((err = ext4_journal_stop(handle))) | 1072 | if ((err = ext4_journal_stop(handle))) |
| 1061 | goto exit_put; | 1073 | goto exit_put; |
| 1074 | |||
| 1075 | /* | ||
| 1076 | * Mark mballoc pages as not up to date so that they will be updated | ||
| 1077 | * next time they are loaded by ext4_mb_load_buddy. | ||
| 1078 | */ | ||
| 1079 | if (test_opt(sb, MBALLOC)) { | ||
| 1080 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 1081 | struct inode *inode = sbi->s_buddy_cache; | ||
| 1082 | int blocks_per_page; | ||
| 1083 | int block; | ||
| 1084 | int pnum; | ||
| 1085 | struct page *page; | ||
| 1086 | |||
| 1087 | /* Set buddy page as not up to date */ | ||
| 1088 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
| 1089 | block = group * 2; | ||
| 1090 | pnum = block / blocks_per_page; | ||
| 1091 | page = find_get_page(inode->i_mapping, pnum); | ||
| 1092 | if (page != NULL) { | ||
| 1093 | ClearPageUptodate(page); | ||
| 1094 | page_cache_release(page); | ||
| 1095 | } | ||
| 1096 | |||
| 1097 | /* Set bitmap page as not up to date */ | ||
| 1098 | block++; | ||
| 1099 | pnum = block / blocks_per_page; | ||
| 1100 | page = find_get_page(inode->i_mapping, pnum); | ||
| 1101 | if (page != NULL) { | ||
| 1102 | ClearPageUptodate(page); | ||
| 1103 | page_cache_release(page); | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | /* Get the info on the last group */ | ||
| 1107 | grp = ext4_get_group_info(sb, group); | ||
| 1108 | |||
| 1109 | /* Update free blocks in group info */ | ||
| 1110 | ext4_mb_update_group_info(grp, add); | ||
| 1111 | } | ||
| 1112 | |||
| 1062 | if (test_opt(sb, DEBUG)) | 1113 | if (test_opt(sb, DEBUG)) |
| 1063 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", | 1114 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", |
| 1064 | ext4_blocks_count(es)); | 1115 | ext4_blocks_count(es)); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cb96f127c366..1cb371dcd609 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb) | |||
| 506 | ext4_ext_release(sb); | 506 | ext4_ext_release(sb); |
| 507 | ext4_xattr_put_super(sb); | 507 | ext4_xattr_put_super(sb); |
| 508 | jbd2_journal_destroy(sbi->s_journal); | 508 | jbd2_journal_destroy(sbi->s_journal); |
| 509 | sbi->s_journal = NULL; | ||
| 509 | if (!(sb->s_flags & MS_RDONLY)) { | 510 | if (!(sb->s_flags & MS_RDONLY)) { |
| 510 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 511 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
| 511 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 512 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
| @@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb) | |||
| 517 | for (i = 0; i < sbi->s_gdb_count; i++) | 518 | for (i = 0; i < sbi->s_gdb_count; i++) |
| 518 | brelse(sbi->s_group_desc[i]); | 519 | brelse(sbi->s_group_desc[i]); |
| 519 | kfree(sbi->s_group_desc); | 520 | kfree(sbi->s_group_desc); |
| 521 | kfree(sbi->s_flex_groups); | ||
| 520 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 522 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
| 521 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 523 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
| 522 | percpu_counter_destroy(&sbi->s_dirs_counter); | 524 | percpu_counter_destroy(&sbi->s_dirs_counter); |
| @@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 571 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | 573 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); |
| 572 | INIT_LIST_HEAD(&ei->i_prealloc_list); | 574 | INIT_LIST_HEAD(&ei->i_prealloc_list); |
| 573 | spin_lock_init(&ei->i_prealloc_lock); | 575 | spin_lock_init(&ei->i_prealloc_lock); |
| 576 | jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); | ||
| 577 | ei->i_reserved_data_blocks = 0; | ||
| 578 | ei->i_reserved_meta_blocks = 0; | ||
| 579 | ei->i_allocated_meta_blocks = 0; | ||
| 580 | ei->i_delalloc_reserved_flag = 0; | ||
| 581 | spin_lock_init(&(ei->i_block_reservation_lock)); | ||
| 574 | return &ei->vfs_inode; | 582 | return &ei->vfs_inode; |
| 575 | } | 583 | } |
| 576 | 584 | ||
| @@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode) | |||
| 635 | EXT4_I(inode)->i_block_alloc_info = NULL; | 643 | EXT4_I(inode)->i_block_alloc_info = NULL; |
| 636 | if (unlikely(rsv)) | 644 | if (unlikely(rsv)) |
| 637 | kfree(rsv); | 645 | kfree(rsv); |
| 646 | jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, | ||
| 647 | &EXT4_I(inode)->jinode); | ||
| 638 | } | 648 | } |
| 639 | 649 | ||
| 640 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) | 650 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) |
| @@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
| 671 | unsigned long def_mount_opts; | 681 | unsigned long def_mount_opts; |
| 672 | struct super_block *sb = vfs->mnt_sb; | 682 | struct super_block *sb = vfs->mnt_sb; |
| 673 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 683 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 674 | journal_t *journal = sbi->s_journal; | ||
| 675 | struct ext4_super_block *es = sbi->s_es; | 684 | struct ext4_super_block *es = sbi->s_es; |
| 676 | 685 | ||
| 677 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); | 686 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); |
| @@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
| 747 | seq_puts(seq, ",nomballoc"); | 756 | seq_puts(seq, ",nomballoc"); |
| 748 | if (test_opt(sb, I_VERSION)) | 757 | if (test_opt(sb, I_VERSION)) |
| 749 | seq_puts(seq, ",i_version"); | 758 | seq_puts(seq, ",i_version"); |
| 759 | if (!test_opt(sb, DELALLOC)) | ||
| 760 | seq_puts(seq, ",nodelalloc"); | ||
| 761 | |||
| 750 | 762 | ||
| 751 | if (sbi->s_stripe) | 763 | if (sbi->s_stripe) |
| 752 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | 764 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); |
| @@ -894,7 +906,7 @@ enum { | |||
| 894 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | 906 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, |
| 895 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | 907 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, |
| 896 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, | 908 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, |
| 897 | Opt_mballoc, Opt_nomballoc, Opt_stripe, | 909 | Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, |
| 898 | }; | 910 | }; |
| 899 | 911 | ||
| 900 | static match_table_t tokens = { | 912 | static match_table_t tokens = { |
| @@ -953,6 +965,8 @@ static match_table_t tokens = { | |||
| 953 | {Opt_nomballoc, "nomballoc"}, | 965 | {Opt_nomballoc, "nomballoc"}, |
| 954 | {Opt_stripe, "stripe=%u"}, | 966 | {Opt_stripe, "stripe=%u"}, |
| 955 | {Opt_resize, "resize"}, | 967 | {Opt_resize, "resize"}, |
| 968 | {Opt_delalloc, "delalloc"}, | ||
| 969 | {Opt_nodelalloc, "nodelalloc"}, | ||
| 956 | {Opt_err, NULL}, | 970 | {Opt_err, NULL}, |
| 957 | }; | 971 | }; |
| 958 | 972 | ||
| @@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 990 | int qtype, qfmt; | 1004 | int qtype, qfmt; |
| 991 | char *qname; | 1005 | char *qname; |
| 992 | #endif | 1006 | #endif |
| 1007 | ext4_fsblk_t last_block; | ||
| 993 | 1008 | ||
| 994 | if (!options) | 1009 | if (!options) |
| 995 | return 1; | 1010 | return 1; |
| @@ -1309,15 +1324,39 @@ set_qf_format: | |||
| 1309 | clear_opt(sbi->s_mount_opt, NOBH); | 1324 | clear_opt(sbi->s_mount_opt, NOBH); |
| 1310 | break; | 1325 | break; |
| 1311 | case Opt_extents: | 1326 | case Opt_extents: |
| 1327 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, | ||
| 1328 | EXT4_FEATURE_INCOMPAT_EXTENTS)) { | ||
| 1329 | ext4_warning(sb, __func__, | ||
| 1330 | "extents feature not enabled " | ||
| 1331 | "on this filesystem, use tune2fs\n"); | ||
| 1332 | return 0; | ||
| 1333 | } | ||
| 1312 | set_opt (sbi->s_mount_opt, EXTENTS); | 1334 | set_opt (sbi->s_mount_opt, EXTENTS); |
| 1313 | break; | 1335 | break; |
| 1314 | case Opt_noextents: | 1336 | case Opt_noextents: |
| 1337 | /* | ||
| 1338 | * When e2fsprogs support resizing an already existing | ||
| 1339 | * ext3 file system to greater than 2**32 we need to | ||
| 1340 | * add support to block allocator to handle growing | ||
| 1341 | * already existing block mapped inode so that blocks | ||
| 1342 | * allocated for them fall within 2**32 | ||
| 1343 | */ | ||
| 1344 | last_block = ext4_blocks_count(sbi->s_es) - 1; | ||
| 1345 | if (last_block > 0xffffffffULL) { | ||
| 1346 | printk(KERN_ERR "EXT4-fs: Filesystem too " | ||
| 1347 | "large to mount with " | ||
| 1348 | "-o noextents options\n"); | ||
| 1349 | return 0; | ||
| 1350 | } | ||
| 1315 | clear_opt (sbi->s_mount_opt, EXTENTS); | 1351 | clear_opt (sbi->s_mount_opt, EXTENTS); |
| 1316 | break; | 1352 | break; |
| 1317 | case Opt_i_version: | 1353 | case Opt_i_version: |
| 1318 | set_opt(sbi->s_mount_opt, I_VERSION); | 1354 | set_opt(sbi->s_mount_opt, I_VERSION); |
| 1319 | sb->s_flags |= MS_I_VERSION; | 1355 | sb->s_flags |= MS_I_VERSION; |
| 1320 | break; | 1356 | break; |
| 1357 | case Opt_nodelalloc: | ||
| 1358 | clear_opt(sbi->s_mount_opt, DELALLOC); | ||
| 1359 | break; | ||
| 1321 | case Opt_mballoc: | 1360 | case Opt_mballoc: |
| 1322 | set_opt(sbi->s_mount_opt, MBALLOC); | 1361 | set_opt(sbi->s_mount_opt, MBALLOC); |
| 1323 | break; | 1362 | break; |
| @@ -1331,6 +1370,9 @@ set_qf_format: | |||
| 1331 | return 0; | 1370 | return 0; |
| 1332 | sbi->s_stripe = option; | 1371 | sbi->s_stripe = option; |
| 1333 | break; | 1372 | break; |
| 1373 | case Opt_delalloc: | ||
| 1374 | set_opt(sbi->s_mount_opt, DELALLOC); | ||
| 1375 | break; | ||
| 1334 | default: | 1376 | default: |
| 1335 | printk (KERN_ERR | 1377 | printk (KERN_ERR |
| 1336 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1378 | "EXT4-fs: Unrecognized mount option \"%s\" " |
| @@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
| 1443 | return res; | 1485 | return res; |
| 1444 | } | 1486 | } |
| 1445 | 1487 | ||
| 1488 | static int ext4_fill_flex_info(struct super_block *sb) | ||
| 1489 | { | ||
| 1490 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
| 1491 | struct ext4_group_desc *gdp = NULL; | ||
| 1492 | struct buffer_head *bh; | ||
| 1493 | ext4_group_t flex_group_count; | ||
| 1494 | ext4_group_t flex_group; | ||
| 1495 | int groups_per_flex = 0; | ||
| 1496 | __u64 block_bitmap = 0; | ||
| 1497 | int i; | ||
| 1498 | |||
| 1499 | if (!sbi->s_es->s_log_groups_per_flex) { | ||
| 1500 | sbi->s_log_groups_per_flex = 0; | ||
| 1501 | return 1; | ||
| 1502 | } | ||
| 1503 | |||
| 1504 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | ||
| 1505 | groups_per_flex = 1 << sbi->s_log_groups_per_flex; | ||
| 1506 | |||
| 1507 | flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / | ||
| 1508 | groups_per_flex; | ||
| 1509 | sbi->s_flex_groups = kmalloc(flex_group_count * | ||
| 1510 | sizeof(struct flex_groups), GFP_KERNEL); | ||
| 1511 | if (sbi->s_flex_groups == NULL) { | ||
| 1512 | printk(KERN_ERR "EXT4-fs: not enough memory\n"); | ||
| 1513 | goto failed; | ||
| 1514 | } | ||
| 1515 | memset(sbi->s_flex_groups, 0, flex_group_count * | ||
| 1516 | sizeof(struct flex_groups)); | ||
| 1517 | |||
| 1518 | gdp = ext4_get_group_desc(sb, 1, &bh); | ||
| 1519 | block_bitmap = ext4_block_bitmap(sb, gdp) - 1; | ||
| 1520 | |||
| 1521 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
| 1522 | gdp = ext4_get_group_desc(sb, i, &bh); | ||
| 1523 | |||
| 1524 | flex_group = ext4_flex_group(sbi, i); | ||
| 1525 | sbi->s_flex_groups[flex_group].free_inodes += | ||
| 1526 | le16_to_cpu(gdp->bg_free_inodes_count); | ||
| 1527 | sbi->s_flex_groups[flex_group].free_blocks += | ||
| 1528 | le16_to_cpu(gdp->bg_free_blocks_count); | ||
| 1529 | } | ||
| 1530 | |||
| 1531 | return 1; | ||
| 1532 | failed: | ||
| 1533 | return 0; | ||
| 1534 | } | ||
| 1535 | |||
| 1446 | __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | 1536 | __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, |
| 1447 | struct ext4_group_desc *gdp) | 1537 | struct ext4_group_desc *gdp) |
| 1448 | { | 1538 | { |
| @@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |||
| 1810 | } | 1900 | } |
| 1811 | 1901 | ||
| 1812 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) | 1902 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) |
| 1813 | __releases(kernel_sem) | 1903 | __releases(kernel_lock) |
| 1814 | __acquires(kernel_sem) | 1904 | __acquires(kernel_lock) |
| 1815 | 1905 | ||
| 1816 | { | 1906 | { |
| 1817 | struct buffer_head * bh; | 1907 | struct buffer_head * bh; |
| @@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 1851 | goto out_fail; | 1941 | goto out_fail; |
| 1852 | } | 1942 | } |
| 1853 | 1943 | ||
| 1854 | if (!sb_set_blocksize(sb, blocksize)) { | ||
| 1855 | printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); | ||
| 1856 | goto out_fail; | ||
| 1857 | } | ||
| 1858 | |||
| 1859 | /* | 1944 | /* |
| 1860 | * The ext4 superblock will not be buffer aligned for other than 1kB | 1945 | * The ext4 superblock will not be buffer aligned for other than 1kB |
| 1861 | * block sizes. We need to calculate the offset from buffer start. | 1946 | * block sizes. We need to calculate the offset from buffer start. |
| @@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 1919 | 2004 | ||
| 1920 | /* | 2005 | /* |
| 1921 | * turn on extents feature by default in ext4 filesystem | 2006 | * turn on extents feature by default in ext4 filesystem |
| 1922 | * User -o noextents to turn it off | 2007 | * only if feature flag already set by mkfs or tune2fs. |
| 2008 | * Use -o noextents to turn it off | ||
| 1923 | */ | 2009 | */ |
| 1924 | set_opt(sbi->s_mount_opt, EXTENTS); | 2010 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) |
| 2011 | set_opt(sbi->s_mount_opt, EXTENTS); | ||
| 2012 | else | ||
| 2013 | ext4_warning(sb, __func__, | ||
| 2014 | "extents feature not enabled on this filesystem, " | ||
| 2015 | "use tune2fs.\n"); | ||
| 1925 | /* | 2016 | /* |
| 1926 | * turn on mballoc feature by default in ext4 filesystem | 2017 | * turn on mballoc code by default in ext4 filesystem |
| 1927 | * User -o nomballoc to turn it off | 2018 | * Use -o nomballoc to turn it off |
| 1928 | */ | 2019 | */ |
| 1929 | set_opt(sbi->s_mount_opt, MBALLOC); | 2020 | set_opt(sbi->s_mount_opt, MBALLOC); |
| 1930 | 2021 | ||
| 2022 | /* | ||
| 2023 | * enable delayed allocation by default | ||
| 2024 | * Use -o nodelalloc to turn it off | ||
| 2025 | */ | ||
| 2026 | set_opt(sbi->s_mount_opt, DELALLOC); | ||
| 2027 | |||
| 2028 | |||
| 1931 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, | 2029 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, |
| 1932 | NULL, 0)) | 2030 | NULL, 0)) |
| 1933 | goto failed_mount; | 2031 | goto failed_mount; |
| @@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2138 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); | 2236 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); |
| 2139 | goto failed_mount2; | 2237 | goto failed_mount2; |
| 2140 | } | 2238 | } |
| 2239 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | ||
| 2240 | if (!ext4_fill_flex_info(sb)) { | ||
| 2241 | printk(KERN_ERR | ||
| 2242 | "EXT4-fs: unable to initialize " | ||
| 2243 | "flex_bg meta info!\n"); | ||
| 2244 | goto failed_mount2; | ||
| 2245 | } | ||
| 2246 | |||
| 2141 | sbi->s_gdb_count = db_count; | 2247 | sbi->s_gdb_count = db_count; |
| 2142 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); | 2248 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); |
| 2143 | spin_lock_init(&sbi->s_next_gen_lock); | 2249 | spin_lock_init(&sbi->s_next_gen_lock); |
| @@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2358 | test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": | 2464 | test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": |
| 2359 | "writeback"); | 2465 | "writeback"); |
| 2360 | 2466 | ||
| 2467 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | ||
| 2468 | printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " | ||
| 2469 | "requested data journaling mode\n"); | ||
| 2470 | clear_opt(sbi->s_mount_opt, DELALLOC); | ||
| 2471 | } else if (test_opt(sb, DELALLOC)) | ||
| 2472 | printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); | ||
| 2473 | |||
| 2361 | ext4_ext_init(sb); | 2474 | ext4_ext_init(sb); |
| 2362 | ext4_mb_init(sb, needs_recovery); | 2475 | ext4_mb_init(sb, needs_recovery); |
| 2363 | 2476 | ||
| @@ -2372,6 +2485,7 @@ cantfind_ext4: | |||
| 2372 | 2485 | ||
| 2373 | failed_mount4: | 2486 | failed_mount4: |
| 2374 | jbd2_journal_destroy(sbi->s_journal); | 2487 | jbd2_journal_destroy(sbi->s_journal); |
| 2488 | sbi->s_journal = NULL; | ||
| 2375 | failed_mount3: | 2489 | failed_mount3: |
| 2376 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 2490 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
| 2377 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 2491 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
| @@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
| 3325 | err = ext4_journal_dirty_metadata(handle, bh); | 3439 | err = ext4_journal_dirty_metadata(handle, bh); |
| 3326 | else { | 3440 | else { |
| 3327 | /* Always do at least ordered writes for quotas */ | 3441 | /* Always do at least ordered writes for quotas */ |
| 3328 | err = ext4_journal_dirty_data(handle, bh); | 3442 | err = ext4_jbd2_file_inode(handle, inode); |
| 3329 | mark_buffer_dirty(bh); | 3443 | mark_buffer_dirty(bh); |
| 3330 | } | 3444 | } |
| 3331 | brelse(bh); | 3445 | brelse(bh); |
| @@ -3337,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
| 3337 | blk++; | 3451 | blk++; |
| 3338 | } | 3452 | } |
| 3339 | out: | 3453 | out: |
| 3340 | if (len == towrite) | 3454 | if (len == towrite) { |
| 3455 | mutex_unlock(&inode->i_mutex); | ||
| 3341 | return err; | 3456 | return err; |
| 3457 | } | ||
| 3342 | if (inode->i_size < off+len-towrite) { | 3458 | if (inode->i_size < off+len-towrite) { |
| 3343 | i_size_write(inode, off+len-towrite); | 3459 | i_size_write(inode, off+len-towrite); |
| 3344 | EXT4_I(inode)->i_disksize = inode->i_size; | 3460 | EXT4_I(inode)->i_disksize = inode->i_size; |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ff08633f398e..93c5fdcdad2e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
| @@ -810,7 +810,7 @@ inserted: | |||
| 810 | /* We need to allocate a new block */ | 810 | /* We need to allocate a new block */ |
| 811 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, | 811 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, |
| 812 | EXT4_I(inode)->i_block_group); | 812 | EXT4_I(inode)->i_block_group); |
| 813 | ext4_fsblk_t block = ext4_new_block(handle, inode, | 813 | ext4_fsblk_t block = ext4_new_meta_block(handle, inode, |
| 814 | goal, &error); | 814 | goal, &error); |
| 815 | if (error) | 815 | if (error) |
| 816 | goto cleanup; | 816 | goto cleanup; |
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index fff33382cadc..ac1a52cf2a37 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c | |||
| @@ -13,13 +13,11 @@ | |||
| 13 | #include "ext4.h" | 13 | #include "ext4.h" |
| 14 | #include "xattr.h" | 14 | #include "xattr.h" |
| 15 | 15 | ||
| 16 | #define XATTR_TRUSTED_PREFIX "trusted." | ||
| 17 | |||
| 18 | static size_t | 16 | static size_t |
| 19 | ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, | 17 | ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, |
| 20 | const char *name, size_t name_len) | 18 | const char *name, size_t name_len) |
| 21 | { | 19 | { |
| 22 | const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; | 20 | const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; |
| 23 | const size_t total_len = prefix_len + name_len + 1; | 21 | const size_t total_len = prefix_len + name_len + 1; |
| 24 | 22 | ||
| 25 | if (!capable(CAP_SYS_ADMIN)) | 23 | if (!capable(CAP_SYS_ADMIN)) |
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 67be723fcc4e..d91aa61b42aa 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c | |||
| @@ -12,13 +12,11 @@ | |||
| 12 | #include "ext4.h" | 12 | #include "ext4.h" |
| 13 | #include "xattr.h" | 13 | #include "xattr.h" |
| 14 | 14 | ||
| 15 | #define XATTR_USER_PREFIX "user." | ||
| 16 | |||
| 17 | static size_t | 15 | static size_t |
| 18 | ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, | 16 | ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, |
| 19 | const char *name, size_t name_len) | 17 | const char *name, size_t name_len) |
| 20 | { | 18 | { |
| 21 | const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; | 19 | const size_t prefix_len = XATTR_USER_PREFIX_LEN; |
| 22 | const size_t total_len = prefix_len + name_len + 1; | 20 | const size_t total_len = prefix_len + name_len + 1; |
| 23 | 21 | ||
| 24 | if (!test_opt(inode->i_sb, XATTR_USER)) | 22 | if (!test_opt(inode->i_sb, XATTR_USER)) |
diff --git a/fs/fat/cache.c b/fs/fat/cache.c index fda25479af26..3a9ecac8d61f 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c | |||
| @@ -61,7 +61,7 @@ void fat_cache_destroy(void) | |||
| 61 | 61 | ||
| 62 | static inline struct fat_cache *fat_cache_alloc(struct inode *inode) | 62 | static inline struct fat_cache *fat_cache_alloc(struct inode *inode) |
| 63 | { | 63 | { |
| 64 | return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); | 64 | return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | static inline void fat_cache_free(struct fat_cache *cache) | 67 | static inline void fat_cache_free(struct fat_cache *cache) |
diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 486725ee99ae..34541d06e626 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c | |||
| @@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, | |||
| 472 | loff_t cpos; | 472 | loff_t cpos; |
| 473 | int ret = 0; | 473 | int ret = 0; |
| 474 | 474 | ||
| 475 | lock_kernel(); | 475 | lock_super(sb); |
| 476 | 476 | ||
| 477 | cpos = filp->f_pos; | 477 | cpos = filp->f_pos; |
| 478 | /* Fake . and .. for the root directory. */ | 478 | /* Fake . and .. for the root directory. */ |
| @@ -654,7 +654,7 @@ FillFailed: | |||
| 654 | if (unicode) | 654 | if (unicode) |
| 655 | __putname(unicode); | 655 | __putname(unicode); |
| 656 | out: | 656 | out: |
| 657 | unlock_kernel(); | 657 | unlock_super(sb); |
| 658 | return ret; | 658 | return ret; |
| 659 | } | 659 | } |
| 660 | 660 | ||
diff --git a/fs/fat/file.c b/fs/fat/file.c index 27cc1164ec36..c672df4036e9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c | |||
| @@ -11,7 +11,6 @@ | |||
| 11 | #include <linux/mount.h> | 11 | #include <linux/mount.h> |
| 12 | #include <linux/time.h> | 12 | #include <linux/time.h> |
| 13 | #include <linux/msdos_fs.h> | 13 | #include <linux/msdos_fs.h> |
| 14 | #include <linux/smp_lock.h> | ||
| 15 | #include <linux/buffer_head.h> | 14 | #include <linux/buffer_head.h> |
| 16 | #include <linux/writeback.h> | 15 | #include <linux/writeback.h> |
| 17 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
| @@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode) | |||
| 242 | 241 | ||
| 243 | nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; | 242 | nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; |
| 244 | 243 | ||
| 245 | lock_kernel(); | ||
| 246 | fat_free(inode, nr_clusters); | 244 | fat_free(inode, nr_clusters); |
| 247 | unlock_kernel(); | ||
| 248 | fat_flush_inodes(inode->i_sb, inode, NULL); | 245 | fat_flush_inodes(inode->i_sb, inode, NULL); |
| 249 | } | 246 | } |
| 250 | 247 | ||
| @@ -257,26 +254,34 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) | |||
| 257 | } | 254 | } |
| 258 | EXPORT_SYMBOL_GPL(fat_getattr); | 255 | EXPORT_SYMBOL_GPL(fat_getattr); |
| 259 | 256 | ||
| 260 | static int fat_check_mode(const struct msdos_sb_info *sbi, struct inode *inode, | 257 | static int fat_sanitize_mode(const struct msdos_sb_info *sbi, |
| 261 | mode_t mode) | 258 | struct inode *inode, umode_t *mode_ptr) |
| 262 | { | 259 | { |
| 263 | mode_t mask, req = mode & ~S_IFMT; | 260 | mode_t mask, perm; |
| 264 | 261 | ||
| 265 | if (S_ISREG(mode)) | 262 | /* |
| 263 | * Note, the basic check is already done by a caller of | ||
| 264 | * (attr->ia_mode & ~MSDOS_VALID_MODE) | ||
| 265 | */ | ||
| 266 | |||
| 267 | if (S_ISREG(inode->i_mode)) | ||
| 266 | mask = sbi->options.fs_fmask; | 268 | mask = sbi->options.fs_fmask; |
| 267 | else | 269 | else |
| 268 | mask = sbi->options.fs_dmask; | 270 | mask = sbi->options.fs_dmask; |
| 269 | 271 | ||
| 272 | perm = *mode_ptr & ~(S_IFMT | mask); | ||
| 273 | |||
| 270 | /* | 274 | /* |
| 271 | * Of the r and x bits, all (subject to umask) must be present. Of the | 275 | * Of the r and x bits, all (subject to umask) must be present. Of the |
| 272 | * w bits, either all (subject to umask) or none must be present. | 276 | * w bits, either all (subject to umask) or none must be present. |
| 273 | */ | 277 | */ |
| 274 | req &= ~mask; | 278 | if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) |
| 275 | if ((req & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) | ||
| 276 | return -EPERM; | 279 | return -EPERM; |
| 277 | if ((req & S_IWUGO) && ((req & S_IWUGO) != (S_IWUGO & ~mask))) | 280 | if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) |
| 278 | return -EPERM; | 281 | return -EPERM; |
| 279 | 282 | ||
| 283 | *mode_ptr &= S_IFMT | perm; | ||
| 284 | |||
| 280 | return 0; | 285 | return 0; |
| 281 | } | 286 | } |
| 282 | 287 | ||
| @@ -299,11 +304,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 299 | { | 304 | { |
| 300 | struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); | 305 | struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); |
| 301 | struct inode *inode = dentry->d_inode; | 306 | struct inode *inode = dentry->d_inode; |
| 302 | int mask, error = 0; | 307 | int error = 0; |
| 303 | unsigned int ia_valid; | 308 | unsigned int ia_valid; |
| 304 | 309 | ||
| 305 | lock_kernel(); | ||
| 306 | |||
| 307 | /* | 310 | /* |
| 308 | * Expand the file. Since inode_setattr() updates ->i_size | 311 | * Expand the file. Since inode_setattr() updates ->i_size |
| 309 | * before calling the ->truncate(), but FAT needs to fill the | 312 | * before calling the ->truncate(), but FAT needs to fill the |
| @@ -332,12 +335,13 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 332 | error = 0; | 335 | error = 0; |
| 333 | goto out; | 336 | goto out; |
| 334 | } | 337 | } |
| 338 | |||
| 335 | if (((attr->ia_valid & ATTR_UID) && | 339 | if (((attr->ia_valid & ATTR_UID) && |
| 336 | (attr->ia_uid != sbi->options.fs_uid)) || | 340 | (attr->ia_uid != sbi->options.fs_uid)) || |
| 337 | ((attr->ia_valid & ATTR_GID) && | 341 | ((attr->ia_valid & ATTR_GID) && |
| 338 | (attr->ia_gid != sbi->options.fs_gid)) || | 342 | (attr->ia_gid != sbi->options.fs_gid)) || |
| 339 | ((attr->ia_valid & ATTR_MODE) && | 343 | ((attr->ia_valid & ATTR_MODE) && |
| 340 | fat_check_mode(sbi, inode, attr->ia_mode) < 0)) | 344 | (attr->ia_mode & ~MSDOS_VALID_MODE))) |
| 341 | error = -EPERM; | 345 | error = -EPERM; |
| 342 | 346 | ||
| 343 | if (error) { | 347 | if (error) { |
| @@ -346,17 +350,17 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 346 | goto out; | 350 | goto out; |
| 347 | } | 351 | } |
| 348 | 352 | ||
| 349 | error = inode_setattr(inode, attr); | 353 | /* |
| 350 | if (error) | 354 | * We don't return -EPERM here. Yes, strange, but this is too |
| 351 | goto out; | 355 | * old behavior. |
| 356 | */ | ||
| 357 | if (attr->ia_valid & ATTR_MODE) { | ||
| 358 | if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) | ||
| 359 | attr->ia_valid &= ~ATTR_MODE; | ||
| 360 | } | ||
| 352 | 361 | ||
| 353 | if (S_ISDIR(inode->i_mode)) | 362 | error = inode_setattr(inode, attr); |
| 354 | mask = sbi->options.fs_dmask; | ||
| 355 | else | ||
| 356 | mask = sbi->options.fs_fmask; | ||
| 357 | inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); | ||
| 358 | out: | 363 | out: |
| 359 | unlock_kernel(); | ||
| 360 | return error; | 364 | return error; |
| 361 | } | 365 | } |
| 362 | EXPORT_SYMBOL_GPL(fat_setattr); | 366 | EXPORT_SYMBOL_GPL(fat_setattr); |
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4e0a3dd9d677..46a4508ffd2e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
| @@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode) | |||
| 440 | 440 | ||
| 441 | static void fat_clear_inode(struct inode *inode) | 441 | static void fat_clear_inode(struct inode *inode) |
| 442 | { | 442 | { |
| 443 | struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); | 443 | struct super_block *sb = inode->i_sb; |
| 444 | struct msdos_sb_info *sbi = MSDOS_SB(sb); | ||
| 444 | 445 | ||
| 445 | lock_kernel(); | ||
| 446 | spin_lock(&sbi->inode_hash_lock); | 446 | spin_lock(&sbi->inode_hash_lock); |
| 447 | fat_cache_inval_inode(inode); | 447 | fat_cache_inval_inode(inode); |
| 448 | hlist_del_init(&MSDOS_I(inode)->i_fat_hash); | 448 | hlist_del_init(&MSDOS_I(inode)->i_fat_hash); |
| 449 | spin_unlock(&sbi->inode_hash_lock); | 449 | spin_unlock(&sbi->inode_hash_lock); |
| 450 | unlock_kernel(); | ||
| 451 | } | 450 | } |
| 452 | 451 | ||
| 453 | static void fat_write_super(struct super_block *sb) | 452 | static void fat_write_super(struct super_block *sb) |
| @@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep; | |||
| 485 | static struct inode *fat_alloc_inode(struct super_block *sb) | 484 | static struct inode *fat_alloc_inode(struct super_block *sb) |
| 486 | { | 485 | { |
| 487 | struct msdos_inode_info *ei; | 486 | struct msdos_inode_info *ei; |
| 488 | ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); | 487 | ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); |
| 489 | if (!ei) | 488 | if (!ei) |
| 490 | return NULL; | 489 | return NULL; |
| 491 | return &ei->vfs_inode; | 490 | return &ei->vfs_inode; |
| @@ -567,7 +566,7 @@ retry: | |||
| 567 | if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) | 566 | if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) |
| 568 | return 0; | 567 | return 0; |
| 569 | 568 | ||
| 570 | lock_kernel(); | 569 | lock_super(sb); |
| 571 | bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); | 570 | bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); |
| 572 | if (!bh) { | 571 | if (!bh) { |
| 573 | printk(KERN_ERR "FAT: unable to read inode block " | 572 | printk(KERN_ERR "FAT: unable to read inode block " |
| @@ -579,7 +578,7 @@ retry: | |||
| 579 | if (i_pos != MSDOS_I(inode)->i_pos) { | 578 | if (i_pos != MSDOS_I(inode)->i_pos) { |
| 580 | spin_unlock(&sbi->inode_hash_lock); | 579 | spin_unlock(&sbi->inode_hash_lock); |
| 581 | brelse(bh); | 580 | brelse(bh); |
| 582 | unlock_kernel(); | 581 | unlock_super(sb); |
| 583 | goto retry; | 582 | goto retry; |
| 584 | } | 583 | } |
| 585 | 584 | ||
| @@ -606,7 +605,7 @@ retry: | |||
| 606 | err = sync_dirty_buffer(bh); | 605 | err = sync_dirty_buffer(bh); |
| 607 | brelse(bh); | 606 | brelse(bh); |
| 608 | out: | 607 | out: |
| 609 | unlock_kernel(); | 608 | unlock_super(sb); |
| 610 | return err; | 609 | return err; |
| 611 | } | 610 | } |
| 612 | 611 | ||
| @@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) | |||
| 736 | 735 | ||
| 737 | static struct dentry *fat_get_parent(struct dentry *child) | 736 | static struct dentry *fat_get_parent(struct dentry *child) |
| 738 | { | 737 | { |
| 738 | struct super_block *sb = child->d_sb; | ||
| 739 | struct buffer_head *bh; | 739 | struct buffer_head *bh; |
| 740 | struct msdos_dir_entry *de; | 740 | struct msdos_dir_entry *de; |
| 741 | loff_t i_pos; | 741 | loff_t i_pos; |
| @@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child) | |||
| 743 | struct inode *inode; | 743 | struct inode *inode; |
| 744 | int err; | 744 | int err; |
| 745 | 745 | ||
| 746 | lock_kernel(); | 746 | lock_super(sb); |
| 747 | 747 | ||
| 748 | err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); | 748 | err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); |
| 749 | if (err) { | 749 | if (err) { |
| 750 | parent = ERR_PTR(err); | 750 | parent = ERR_PTR(err); |
| 751 | goto out; | 751 | goto out; |
| 752 | } | 752 | } |
| 753 | inode = fat_build_inode(child->d_sb, de, i_pos); | 753 | inode = fat_build_inode(sb, de, i_pos); |
| 754 | brelse(bh); | 754 | brelse(bh); |
| 755 | if (IS_ERR(inode)) { | 755 | if (IS_ERR(inode)) { |
| 756 | parent = ERR_CAST(inode); | 756 | parent = ERR_CAST(inode); |
| @@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child) | |||
| 762 | parent = ERR_PTR(-ENOMEM); | 762 | parent = ERR_PTR(-ENOMEM); |
| 763 | } | 763 | } |
| 764 | out: | 764 | out: |
| 765 | unlock_kernel(); | 765 | unlock_super(sb); |
| 766 | 766 | ||
| 767 | return parent; | 767 | return parent; |
| 768 | } | 768 | } |
| @@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, | |||
| 1172 | long error; | 1172 | long error; |
| 1173 | char buf[50]; | 1173 | char buf[50]; |
| 1174 | 1174 | ||
| 1175 | /* | ||
| 1176 | * GFP_KERNEL is ok here, because while we do hold the | ||
| 1177 | * supeblock lock, memory pressure can't call back into | ||
| 1178 | * the filesystem, since we're only just about to mount | ||
| 1179 | * it and have no inodes etc active! | ||
| 1180 | */ | ||
| 1175 | sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); | 1181 | sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); |
| 1176 | if (!sbi) | 1182 | if (!sbi) |
| 1177 | return -ENOMEM; | 1183 | return -ENOMEM; |
diff --git a/fs/fcntl.c b/fs/fcntl.c index bfd776509a72..330a7d782591 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | #include <linux/fdtable.h> | 12 | #include <linux/fdtable.h> |
| 13 | #include <linux/capability.h> | 13 | #include <linux/capability.h> |
| 14 | #include <linux/dnotify.h> | 14 | #include <linux/dnotify.h> |
| 15 | #include <linux/smp_lock.h> | ||
| 16 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 17 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| 18 | #include <linux/security.h> | 17 | #include <linux/security.h> |
| @@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) | |||
| 227 | if (error) | 226 | if (error) |
| 228 | return error; | 227 | return error; |
| 229 | 228 | ||
| 230 | lock_kernel(); | ||
| 231 | if ((arg ^ filp->f_flags) & FASYNC) { | 229 | if ((arg ^ filp->f_flags) & FASYNC) { |
| 232 | if (filp->f_op && filp->f_op->fasync) { | 230 | if (filp->f_op && filp->f_op->fasync) { |
| 233 | error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); | 231 | error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); |
| @@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) | |||
| 238 | 236 | ||
| 239 | filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); | 237 | filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); |
| 240 | out: | 238 | out: |
| 241 | unlock_kernel(); | ||
| 242 | return error; | 239 | return error; |
| 243 | } | 240 | } |
| 244 | 241 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae45f77765c0..25adfc3c693a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
| @@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 424 | * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so | 424 | * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so |
| 425 | * that it can be located for waiting on in __writeback_single_inode(). | 425 | * that it can be located for waiting on in __writeback_single_inode(). |
| 426 | * | 426 | * |
| 427 | * Called under inode_lock. | ||
| 428 | * | ||
| 429 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. | 427 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. |
| 430 | * This function assumes that the blockdev superblock's inodes are backed by | 428 | * This function assumes that the blockdev superblock's inodes are backed by |
| 431 | * a variety of queues, so all inodes are searched. For other superblocks, | 429 | * a variety of queues, so all inodes are searched. For other superblocks, |
| @@ -441,11 +439,12 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 441 | * on the writer throttling path, and we get decent balancing between many | 439 | * on the writer throttling path, and we get decent balancing between many |
| 442 | * throttled threads: we don't want them all piling up on inode_sync_wait. | 440 | * throttled threads: we don't want them all piling up on inode_sync_wait. |
| 443 | */ | 441 | */ |
| 444 | static void | 442 | void generic_sync_sb_inodes(struct super_block *sb, |
| 445 | sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) | 443 | struct writeback_control *wbc) |
| 446 | { | 444 | { |
| 447 | const unsigned long start = jiffies; /* livelock avoidance */ | 445 | const unsigned long start = jiffies; /* livelock avoidance */ |
| 448 | 446 | ||
| 447 | spin_lock(&inode_lock); | ||
| 449 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) | 448 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) |
| 450 | queue_io(sb, wbc->older_than_this); | 449 | queue_io(sb, wbc->older_than_this); |
| 451 | 450 | ||
| @@ -524,8 +523,16 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) | |||
| 524 | if (!list_empty(&sb->s_more_io)) | 523 | if (!list_empty(&sb->s_more_io)) |
| 525 | wbc->more_io = 1; | 524 | wbc->more_io = 1; |
| 526 | } | 525 | } |
| 526 | spin_unlock(&inode_lock); | ||
| 527 | return; /* Leave any unwritten inodes on s_io */ | 527 | return; /* Leave any unwritten inodes on s_io */ |
| 528 | } | 528 | } |
| 529 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); | ||
| 530 | |||
| 531 | static void sync_sb_inodes(struct super_block *sb, | ||
| 532 | struct writeback_control *wbc) | ||
| 533 | { | ||
| 534 | generic_sync_sb_inodes(sb, wbc); | ||
| 535 | } | ||
| 529 | 536 | ||
| 530 | /* | 537 | /* |
| 531 | * Start writeback of dirty pagecache data against all unlocked inodes. | 538 | * Start writeback of dirty pagecache data against all unlocked inodes. |
| @@ -565,11 +572,8 @@ restart: | |||
| 565 | * be unmounted by the time it is released. | 572 | * be unmounted by the time it is released. |
| 566 | */ | 573 | */ |
| 567 | if (down_read_trylock(&sb->s_umount)) { | 574 | if (down_read_trylock(&sb->s_umount)) { |
| 568 | if (sb->s_root) { | 575 | if (sb->s_root) |
| 569 | spin_lock(&inode_lock); | ||
| 570 | sync_sb_inodes(sb, wbc); | 576 | sync_sb_inodes(sb, wbc); |
| 571 | spin_unlock(&inode_lock); | ||
| 572 | } | ||
| 573 | up_read(&sb->s_umount); | 577 | up_read(&sb->s_umount); |
| 574 | } | 578 | } |
| 575 | spin_lock(&sb_lock); | 579 | spin_lock(&sb_lock); |
| @@ -607,9 +611,7 @@ void sync_inodes_sb(struct super_block *sb, int wait) | |||
| 607 | (inodes_stat.nr_inodes - inodes_stat.nr_unused) + | 611 | (inodes_stat.nr_inodes - inodes_stat.nr_unused) + |
| 608 | nr_dirty + nr_unstable; | 612 | nr_dirty + nr_unstable; |
| 609 | wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ | 613 | wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ |
| 610 | spin_lock(&inode_lock); | ||
| 611 | sync_sb_inodes(sb, &wbc); | 614 | sync_sb_inodes(sb, &wbc); |
| 612 | spin_unlock(&inode_lock); | ||
| 613 | } | 615 | } |
| 614 | 616 | ||
| 615 | /* | 617 | /* |
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 43e99513334a..3141690558c8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
| @@ -591,7 +591,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
| 591 | fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); | 591 | fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); |
| 592 | fc->minor = arg->minor; | 592 | fc->minor = arg->minor; |
| 593 | fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; | 593 | fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; |
| 594 | fc->max_write = min_t(unsigned, 4096, fc->max_write); | 594 | fc->max_write = max_t(unsigned, 4096, fc->max_write); |
| 595 | fc->conn_init = 1; | 595 | fc->conn_init = 1; |
| 596 | } | 596 | } |
| 597 | fuse_put_request(fc, req); | 597 | fuse_put_request(fc, req); |
| @@ -667,7 +667,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
| 667 | fc->flags = d.flags; | 667 | fc->flags = d.flags; |
| 668 | fc->user_id = d.user_id; | 668 | fc->user_id = d.user_id; |
| 669 | fc->group_id = d.group_id; | 669 | fc->group_id = d.group_id; |
| 670 | fc->max_read = min_t(unsigned, 4096, d.max_read); | 670 | fc->max_read = max_t(unsigned, 4096, d.max_read); |
| 671 | 671 | ||
| 672 | /* Used by get_root_inode() */ | 672 | /* Used by get_root_inode() */ |
| 673 | sb->s_fs_info = fc; | 673 | sb->s_fs_info = fc; |
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 7f7947e3dfbb..ab2f57e3fb87 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
| @@ -14,23 +14,11 @@ config GFS2_FS | |||
| 14 | GFS is perfect consistency -- changes made to the filesystem on one | 14 | GFS is perfect consistency -- changes made to the filesystem on one |
| 15 | machine show up immediately on all other machines in the cluster. | 15 | machine show up immediately on all other machines in the cluster. |
| 16 | 16 | ||
| 17 | To use the GFS2 filesystem, you will need to enable one or more of | 17 | To use the GFS2 filesystem in a cluster, you will need to enable |
| 18 | the below locking modules. Documentation and utilities for GFS2 can | 18 | the locking module below. Documentation and utilities for GFS2 can |
| 19 | be found here: http://sources.redhat.com/cluster | 19 | be found here: http://sources.redhat.com/cluster |
| 20 | 20 | ||
| 21 | config GFS2_FS_LOCKING_NOLOCK | 21 | The "nolock" lock module is now built in to GFS2 by default. |
| 22 | tristate "GFS2 \"nolock\" locking module" | ||
| 23 | depends on GFS2_FS | ||
| 24 | help | ||
| 25 | Single node locking module for GFS2. | ||
| 26 | |||
| 27 | Use this module if you want to use GFS2 on a single node without | ||
| 28 | its clustering features. You can still take advantage of the | ||
| 29 | large file support, and upgrade to running a full cluster later on | ||
| 30 | if required. | ||
| 31 | |||
| 32 | If you will only be using GFS2 in cluster mode, you do not need this | ||
| 33 | module. | ||
| 34 | 22 | ||
| 35 | config GFS2_FS_LOCKING_DLM | 23 | config GFS2_FS_LOCKING_DLM |
| 36 | tristate "GFS2 DLM locking module" | 24 | tristate "GFS2 DLM locking module" |
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index e2350df02a07..ec65851ec80a 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile | |||
| @@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ | |||
| 5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ | 5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ |
| 6 | recovery.o rgrp.o super.o sys.o trans.o util.o | 6 | recovery.o rgrp.o super.o sys.o trans.o util.o |
| 7 | 7 | ||
| 8 | obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ | ||
| 9 | obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ | 8 | obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ |
| 10 | 9 | ||
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70e..bec76b1c2bb0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
| @@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, | |||
| 246 | 246 | ||
| 247 | } | 247 | } |
| 248 | 248 | ||
| 249 | static inline unsigned int zero_metapath_length(const struct metapath *mp, | 249 | static inline unsigned int metapath_branch_start(const struct metapath *mp) |
| 250 | unsigned height) | ||
| 251 | { | 250 | { |
| 252 | unsigned int i; | 251 | if (mp->mp_list[0] == 0) |
| 253 | for (i = 0; i < height - 1; i++) { | 252 | return 2; |
| 254 | if (mp->mp_list[i] != 0) | 253 | return 1; |
| 255 | return i; | ||
| 256 | } | ||
| 257 | return height; | ||
| 258 | } | 254 | } |
| 259 | 255 | ||
| 260 | /** | 256 | /** |
| @@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 436 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 432 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
| 437 | struct buffer_head *dibh = mp->mp_bh[0]; | 433 | struct buffer_head *dibh = mp->mp_bh[0]; |
| 438 | u64 bn, dblock = 0; | 434 | u64 bn, dblock = 0; |
| 439 | unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; | 435 | unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; |
| 440 | unsigned dblks = 0; | 436 | unsigned dblks = 0; |
| 441 | unsigned ptrs_per_blk; | 437 | unsigned ptrs_per_blk; |
| 442 | const unsigned end_of_metadata = height - 1; | 438 | const unsigned end_of_metadata = height - 1; |
| @@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 471 | /* Building up tree height */ | 467 | /* Building up tree height */ |
| 472 | state = ALLOC_GROW_HEIGHT; | 468 | state = ALLOC_GROW_HEIGHT; |
| 473 | iblks = height - ip->i_height; | 469 | iblks = height - ip->i_height; |
| 474 | zmpl = zero_metapath_length(mp, height); | 470 | branch_start = metapath_branch_start(mp); |
| 475 | iblks -= zmpl; | 471 | iblks += (height - branch_start); |
| 476 | iblks += height; | ||
| 477 | } | 472 | } |
| 478 | } | 473 | } |
| 479 | 474 | ||
| @@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 509 | sizeof(struct gfs2_meta_header)); | 504 | sizeof(struct gfs2_meta_header)); |
| 510 | *ptr = zero_bn; | 505 | *ptr = zero_bn; |
| 511 | state = ALLOC_GROW_DEPTH; | 506 | state = ALLOC_GROW_DEPTH; |
| 512 | for(i = zmpl; i < height; i++) { | 507 | for(i = branch_start; i < height; i++) { |
| 513 | if (mp->mp_bh[i] == NULL) | 508 | if (mp->mp_bh[i] == NULL) |
| 514 | break; | 509 | break; |
| 515 | brelse(mp->mp_bh[i]); | 510 | brelse(mp->mp_bh[i]); |
| 516 | mp->mp_bh[i] = NULL; | 511 | mp->mp_bh[i] = NULL; |
| 517 | } | 512 | } |
| 518 | i = zmpl; | 513 | i = branch_start; |
| 519 | } | 514 | } |
| 520 | if (n == 0) | 515 | if (n == 0) |
| 521 | break; | 516 | break; |
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h index 3bb11c0f8b56..ef606e3a5cf4 100644 --- a/fs/gfs2/gfs2.h +++ b/fs/gfs2/gfs2.h | |||
| @@ -16,11 +16,6 @@ enum { | |||
| 16 | }; | 16 | }; |
| 17 | 17 | ||
| 18 | enum { | 18 | enum { |
| 19 | NO_WAIT = 0, | ||
| 20 | WAIT = 1, | ||
| 21 | }; | ||
| 22 | |||
| 23 | enum { | ||
| 24 | NO_FORCE = 0, | 19 | NO_FORCE = 0, |
| 25 | FORCE = 1, | 20 | FORCE = 1, |
| 26 | }; | 21 | }; |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d636b3e80f5d..13391e546616 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
| @@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket { | |||
| 45 | struct hlist_head hb_list; | 45 | struct hlist_head hb_list; |
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | struct glock_iter { | 48 | struct gfs2_glock_iter { |
| 49 | int hash; /* hash bucket index */ | 49 | int hash; /* hash bucket index */ |
| 50 | struct gfs2_sbd *sdp; /* incore superblock */ | 50 | struct gfs2_sbd *sdp; /* incore superblock */ |
| 51 | struct gfs2_glock *gl; /* current glock struct */ | 51 | struct gfs2_glock *gl; /* current glock struct */ |
| 52 | struct seq_file *seq; /* sequence file for debugfs */ | 52 | char string[512]; /* scratch space */ |
| 53 | char string[512]; /* scratch space */ | ||
| 54 | }; | 53 | }; |
| 55 | 54 | ||
| 56 | typedef void (*glock_examiner) (struct gfs2_glock * gl); | 55 | typedef void (*glock_examiner) (struct gfs2_glock * gl); |
| 57 | 56 | ||
| 58 | static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); | 57 | static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); |
| 59 | static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); | 58 | static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); |
| 60 | static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); | 59 | #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) |
| 61 | static void gfs2_glock_drop_th(struct gfs2_glock *gl); | 60 | static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); |
| 62 | static void run_queue(struct gfs2_glock *gl); | ||
| 63 | 61 | ||
| 64 | static DECLARE_RWSEM(gfs2_umount_flush_sem); | 62 | static DECLARE_RWSEM(gfs2_umount_flush_sem); |
| 65 | static struct dentry *gfs2_root; | 63 | static struct dentry *gfs2_root; |
| @@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x) | |||
| 123 | #endif | 121 | #endif |
| 124 | 122 | ||
| 125 | /** | 123 | /** |
| 126 | * relaxed_state_ok - is a requested lock compatible with the current lock mode? | ||
| 127 | * @actual: the current state of the lock | ||
| 128 | * @requested: the lock state that was requested by the caller | ||
| 129 | * @flags: the modifier flags passed in by the caller | ||
| 130 | * | ||
| 131 | * Returns: 1 if the locks are compatible, 0 otherwise | ||
| 132 | */ | ||
| 133 | |||
| 134 | static inline int relaxed_state_ok(unsigned int actual, unsigned requested, | ||
| 135 | int flags) | ||
| 136 | { | ||
| 137 | if (actual == requested) | ||
| 138 | return 1; | ||
| 139 | |||
| 140 | if (flags & GL_EXACT) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) | ||
| 144 | return 1; | ||
| 145 | |||
| 146 | if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY)) | ||
| 147 | return 1; | ||
| 148 | |||
| 149 | return 0; | ||
| 150 | } | ||
| 151 | |||
| 152 | /** | ||
| 153 | * gl_hash() - Turn glock number into hash bucket number | 124 | * gl_hash() - Turn glock number into hash bucket number |
| 154 | * @lock: The glock number | 125 | * @lock: The glock number |
| 155 | * | 126 | * |
| @@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl) | |||
| 182 | struct gfs2_sbd *sdp = gl->gl_sbd; | 153 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 183 | struct inode *aspace = gl->gl_aspace; | 154 | struct inode *aspace = gl->gl_aspace; |
| 184 | 155 | ||
| 185 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 156 | if (sdp->sd_lockstruct.ls_ops->lm_put_lock) |
| 186 | sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); | 157 | sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); |
| 187 | 158 | ||
| 188 | if (aspace) | 159 | if (aspace) |
| @@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) | |||
| 211 | int gfs2_glock_put(struct gfs2_glock *gl) | 182 | int gfs2_glock_put(struct gfs2_glock *gl) |
| 212 | { | 183 | { |
| 213 | int rv = 0; | 184 | int rv = 0; |
| 214 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 215 | 185 | ||
| 216 | write_lock(gl_lock_addr(gl->gl_hash)); | 186 | write_lock(gl_lock_addr(gl->gl_hash)); |
| 217 | if (atomic_dec_and_test(&gl->gl_ref)) { | 187 | if (atomic_dec_and_test(&gl->gl_ref)) { |
| 218 | hlist_del(&gl->gl_list); | 188 | hlist_del(&gl->gl_list); |
| 219 | write_unlock(gl_lock_addr(gl->gl_hash)); | 189 | write_unlock(gl_lock_addr(gl->gl_hash)); |
| 220 | gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); | 190 | GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); |
| 221 | gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); | 191 | GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); |
| 222 | gfs2_assert(sdp, list_empty(&gl->gl_holders)); | 192 | GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); |
| 223 | gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); | ||
| 224 | gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); | ||
| 225 | glock_free(gl); | 193 | glock_free(gl); |
| 226 | rv = 1; | 194 | rv = 1; |
| 227 | goto out; | 195 | goto out; |
| @@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, | |||
| 281 | return gl; | 249 | return gl; |
| 282 | } | 250 | } |
| 283 | 251 | ||
| 252 | /** | ||
| 253 | * may_grant - check if its ok to grant a new lock | ||
| 254 | * @gl: The glock | ||
| 255 | * @gh: The lock request which we wish to grant | ||
| 256 | * | ||
| 257 | * Returns: true if its ok to grant the lock | ||
| 258 | */ | ||
| 259 | |||
| 260 | static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) | ||
| 261 | { | ||
| 262 | const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); | ||
| 263 | if ((gh->gh_state == LM_ST_EXCLUSIVE || | ||
| 264 | gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) | ||
| 265 | return 0; | ||
| 266 | if (gl->gl_state == gh->gh_state) | ||
| 267 | return 1; | ||
| 268 | if (gh->gh_flags & GL_EXACT) | ||
| 269 | return 0; | ||
| 270 | if (gl->gl_state == LM_ST_EXCLUSIVE) { | ||
| 271 | if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) | ||
| 272 | return 1; | ||
| 273 | if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) | ||
| 274 | return 1; | ||
| 275 | } | ||
| 276 | if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) | ||
| 277 | return 1; | ||
| 278 | return 0; | ||
| 279 | } | ||
| 280 | |||
| 281 | static void gfs2_holder_wake(struct gfs2_holder *gh) | ||
| 282 | { | ||
| 283 | clear_bit(HIF_WAIT, &gh->gh_iflags); | ||
| 284 | smp_mb__after_clear_bit(); | ||
| 285 | wake_up_bit(&gh->gh_iflags, HIF_WAIT); | ||
| 286 | } | ||
| 287 | |||
| 288 | /** | ||
| 289 | * do_promote - promote as many requests as possible on the current queue | ||
| 290 | * @gl: The glock | ||
| 291 | * | ||
| 292 | * Returns: true if there is a blocked holder at the head of the list | ||
| 293 | */ | ||
| 294 | |||
| 295 | static int do_promote(struct gfs2_glock *gl) | ||
| 296 | { | ||
| 297 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 298 | struct gfs2_holder *gh, *tmp; | ||
| 299 | int ret; | ||
| 300 | |||
| 301 | restart: | ||
| 302 | list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { | ||
| 303 | if (test_bit(HIF_HOLDER, &gh->gh_iflags)) | ||
| 304 | continue; | ||
| 305 | if (may_grant(gl, gh)) { | ||
| 306 | if (gh->gh_list.prev == &gl->gl_holders && | ||
| 307 | glops->go_lock) { | ||
| 308 | spin_unlock(&gl->gl_spin); | ||
| 309 | /* FIXME: eliminate this eventually */ | ||
| 310 | ret = glops->go_lock(gh); | ||
| 311 | spin_lock(&gl->gl_spin); | ||
| 312 | if (ret) { | ||
| 313 | gh->gh_error = ret; | ||
| 314 | list_del_init(&gh->gh_list); | ||
| 315 | gfs2_holder_wake(gh); | ||
| 316 | goto restart; | ||
| 317 | } | ||
| 318 | set_bit(HIF_HOLDER, &gh->gh_iflags); | ||
| 319 | gfs2_holder_wake(gh); | ||
| 320 | goto restart; | ||
| 321 | } | ||
| 322 | set_bit(HIF_HOLDER, &gh->gh_iflags); | ||
| 323 | gfs2_holder_wake(gh); | ||
| 324 | continue; | ||
| 325 | } | ||
| 326 | if (gh->gh_list.prev == &gl->gl_holders) | ||
| 327 | return 1; | ||
| 328 | break; | ||
| 329 | } | ||
| 330 | return 0; | ||
| 331 | } | ||
| 332 | |||
| 333 | /** | ||
| 334 | * do_error - Something unexpected has happened during a lock request | ||
| 335 | * | ||
| 336 | */ | ||
| 337 | |||
| 338 | static inline void do_error(struct gfs2_glock *gl, const int ret) | ||
| 339 | { | ||
| 340 | struct gfs2_holder *gh, *tmp; | ||
| 341 | |||
| 342 | list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { | ||
| 343 | if (test_bit(HIF_HOLDER, &gh->gh_iflags)) | ||
| 344 | continue; | ||
| 345 | if (ret & LM_OUT_ERROR) | ||
| 346 | gh->gh_error = -EIO; | ||
| 347 | else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) | ||
| 348 | gh->gh_error = GLR_TRYFAILED; | ||
| 349 | else | ||
| 350 | continue; | ||
| 351 | list_del_init(&gh->gh_list); | ||
| 352 | gfs2_holder_wake(gh); | ||
| 353 | } | ||
| 354 | } | ||
| 355 | |||
| 356 | /** | ||
| 357 | * find_first_waiter - find the first gh that's waiting for the glock | ||
| 358 | * @gl: the glock | ||
| 359 | */ | ||
| 360 | |||
| 361 | static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) | ||
| 362 | { | ||
| 363 | struct gfs2_holder *gh; | ||
| 364 | |||
| 365 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { | ||
| 366 | if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) | ||
| 367 | return gh; | ||
| 368 | } | ||
| 369 | return NULL; | ||
| 370 | } | ||
| 371 | |||
| 372 | /** | ||
| 373 | * state_change - record that the glock is now in a different state | ||
| 374 | * @gl: the glock | ||
| 375 | * @new_state the new state | ||
| 376 | * | ||
| 377 | */ | ||
| 378 | |||
| 379 | static void state_change(struct gfs2_glock *gl, unsigned int new_state) | ||
| 380 | { | ||
| 381 | int held1, held2; | ||
| 382 | |||
| 383 | held1 = (gl->gl_state != LM_ST_UNLOCKED); | ||
| 384 | held2 = (new_state != LM_ST_UNLOCKED); | ||
| 385 | |||
| 386 | if (held1 != held2) { | ||
| 387 | if (held2) | ||
| 388 | gfs2_glock_hold(gl); | ||
| 389 | else | ||
| 390 | gfs2_glock_put(gl); | ||
| 391 | } | ||
| 392 | |||
| 393 | gl->gl_state = new_state; | ||
| 394 | gl->gl_tchange = jiffies; | ||
| 395 | } | ||
| 396 | |||
| 397 | static void gfs2_demote_wake(struct gfs2_glock *gl) | ||
| 398 | { | ||
| 399 | gl->gl_demote_state = LM_ST_EXCLUSIVE; | ||
| 400 | clear_bit(GLF_DEMOTE, &gl->gl_flags); | ||
| 401 | smp_mb__after_clear_bit(); | ||
| 402 | wake_up_bit(&gl->gl_flags, GLF_DEMOTE); | ||
| 403 | } | ||
| 404 | |||
| 405 | /** | ||
| 406 | * finish_xmote - The DLM has replied to one of our lock requests | ||
| 407 | * @gl: The glock | ||
| 408 | * @ret: The status from the DLM | ||
| 409 | * | ||
| 410 | */ | ||
| 411 | |||
| 412 | static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) | ||
| 413 | { | ||
| 414 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 415 | struct gfs2_holder *gh; | ||
| 416 | unsigned state = ret & LM_OUT_ST_MASK; | ||
| 417 | |||
| 418 | spin_lock(&gl->gl_spin); | ||
| 419 | state_change(gl, state); | ||
| 420 | gh = find_first_waiter(gl); | ||
| 421 | |||
| 422 | /* Demote to UN request arrived during demote to SH or DF */ | ||
| 423 | if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && | ||
| 424 | state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) | ||
| 425 | gl->gl_target = LM_ST_UNLOCKED; | ||
| 426 | |||
| 427 | /* Check for state != intended state */ | ||
| 428 | if (unlikely(state != gl->gl_target)) { | ||
| 429 | if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { | ||
| 430 | /* move to back of queue and try next entry */ | ||
| 431 | if (ret & LM_OUT_CANCELED) { | ||
| 432 | if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) | ||
| 433 | list_move_tail(&gh->gh_list, &gl->gl_holders); | ||
| 434 | gh = find_first_waiter(gl); | ||
| 435 | gl->gl_target = gh->gh_state; | ||
| 436 | goto retry; | ||
| 437 | } | ||
| 438 | /* Some error or failed "try lock" - report it */ | ||
| 439 | if ((ret & LM_OUT_ERROR) || | ||
| 440 | (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { | ||
| 441 | gl->gl_target = gl->gl_state; | ||
| 442 | do_error(gl, ret); | ||
| 443 | goto out; | ||
| 444 | } | ||
| 445 | } | ||
| 446 | switch(state) { | ||
| 447 | /* Unlocked due to conversion deadlock, try again */ | ||
| 448 | case LM_ST_UNLOCKED: | ||
| 449 | retry: | ||
| 450 | do_xmote(gl, gh, gl->gl_target); | ||
| 451 | break; | ||
| 452 | /* Conversion fails, unlock and try again */ | ||
| 453 | case LM_ST_SHARED: | ||
| 454 | case LM_ST_DEFERRED: | ||
| 455 | do_xmote(gl, gh, LM_ST_UNLOCKED); | ||
| 456 | break; | ||
| 457 | default: /* Everything else */ | ||
| 458 | printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); | ||
| 459 | GLOCK_BUG_ON(gl, 1); | ||
| 460 | } | ||
| 461 | spin_unlock(&gl->gl_spin); | ||
| 462 | gfs2_glock_put(gl); | ||
| 463 | return; | ||
| 464 | } | ||
| 465 | |||
| 466 | /* Fast path - we got what we asked for */ | ||
| 467 | if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) | ||
| 468 | gfs2_demote_wake(gl); | ||
| 469 | if (state != LM_ST_UNLOCKED) { | ||
| 470 | if (glops->go_xmote_bh) { | ||
| 471 | int rv; | ||
| 472 | spin_unlock(&gl->gl_spin); | ||
| 473 | rv = glops->go_xmote_bh(gl, gh); | ||
| 474 | if (rv == -EAGAIN) | ||
| 475 | return; | ||
| 476 | spin_lock(&gl->gl_spin); | ||
| 477 | if (rv) { | ||
| 478 | do_error(gl, rv); | ||
| 479 | goto out; | ||
| 480 | } | ||
| 481 | } | ||
| 482 | do_promote(gl); | ||
| 483 | } | ||
| 484 | out: | ||
| 485 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 486 | spin_unlock(&gl->gl_spin); | ||
| 487 | gfs2_glock_put(gl); | ||
| 488 | } | ||
| 489 | |||
| 490 | static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, | ||
| 491 | unsigned int cur_state, unsigned int req_state, | ||
| 492 | unsigned int flags) | ||
| 493 | { | ||
| 494 | int ret = LM_OUT_ERROR; | ||
| 495 | |||
| 496 | if (!sdp->sd_lockstruct.ls_ops->lm_lock) | ||
| 497 | return req_state == LM_ST_UNLOCKED ? 0 : req_state; | ||
| 498 | |||
| 499 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 500 | ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, | ||
| 501 | req_state, flags); | ||
| 502 | return ret; | ||
| 503 | } | ||
| 504 | |||
| 505 | /** | ||
| 506 | * do_xmote - Calls the DLM to change the state of a lock | ||
| 507 | * @gl: The lock state | ||
| 508 | * @gh: The holder (only for promotes) | ||
| 509 | * @target: The target lock state | ||
| 510 | * | ||
| 511 | */ | ||
| 512 | |||
| 513 | static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) | ||
| 514 | { | ||
| 515 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 516 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 517 | unsigned int lck_flags = gh ? gh->gh_flags : 0; | ||
| 518 | int ret; | ||
| 519 | |||
| 520 | lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | | ||
| 521 | LM_FLAG_PRIORITY); | ||
| 522 | BUG_ON(gl->gl_state == target); | ||
| 523 | BUG_ON(gl->gl_state == gl->gl_target); | ||
| 524 | if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && | ||
| 525 | glops->go_inval) { | ||
| 526 | set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); | ||
| 527 | do_error(gl, 0); /* Fail queued try locks */ | ||
| 528 | } | ||
| 529 | spin_unlock(&gl->gl_spin); | ||
| 530 | if (glops->go_xmote_th) | ||
| 531 | glops->go_xmote_th(gl); | ||
| 532 | if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) | ||
| 533 | glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); | ||
| 534 | clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); | ||
| 535 | |||
| 536 | gfs2_glock_hold(gl); | ||
| 537 | if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED || | ||
| 538 | gl->gl_state == LM_ST_DEFERRED) && | ||
| 539 | !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) | ||
| 540 | lck_flags |= LM_FLAG_TRY_1CB; | ||
| 541 | ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags); | ||
| 542 | |||
| 543 | if (!(ret & LM_OUT_ASYNC)) { | ||
| 544 | finish_xmote(gl, ret); | ||
| 545 | gfs2_glock_hold(gl); | ||
| 546 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
| 547 | gfs2_glock_put(gl); | ||
| 548 | } else { | ||
| 549 | GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); | ||
| 550 | } | ||
| 551 | spin_lock(&gl->gl_spin); | ||
| 552 | } | ||
| 553 | |||
| 554 | /** | ||
| 555 | * find_first_holder - find the first "holder" gh | ||
| 556 | * @gl: the glock | ||
| 557 | */ | ||
| 558 | |||
| 559 | static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) | ||
| 560 | { | ||
| 561 | struct gfs2_holder *gh; | ||
| 562 | |||
| 563 | if (!list_empty(&gl->gl_holders)) { | ||
| 564 | gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); | ||
| 565 | if (test_bit(HIF_HOLDER, &gh->gh_iflags)) | ||
| 566 | return gh; | ||
| 567 | } | ||
| 568 | return NULL; | ||
| 569 | } | ||
| 570 | |||
| 571 | /** | ||
| 572 | * run_queue - do all outstanding tasks related to a glock | ||
| 573 | * @gl: The glock in question | ||
| 574 | * @nonblock: True if we must not block in run_queue | ||
| 575 | * | ||
| 576 | */ | ||
| 577 | |||
| 578 | static void run_queue(struct gfs2_glock *gl, const int nonblock) | ||
| 579 | { | ||
| 580 | struct gfs2_holder *gh = NULL; | ||
| 581 | |||
| 582 | if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) | ||
| 583 | return; | ||
| 584 | |||
| 585 | GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); | ||
| 586 | |||
| 587 | if (test_bit(GLF_DEMOTE, &gl->gl_flags) && | ||
| 588 | gl->gl_demote_state != gl->gl_state) { | ||
| 589 | if (find_first_holder(gl)) | ||
| 590 | goto out; | ||
| 591 | if (nonblock) | ||
| 592 | goto out_sched; | ||
| 593 | set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); | ||
| 594 | GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); | ||
| 595 | gl->gl_target = gl->gl_demote_state; | ||
| 596 | } else { | ||
| 597 | if (test_bit(GLF_DEMOTE, &gl->gl_flags)) | ||
| 598 | gfs2_demote_wake(gl); | ||
| 599 | if (do_promote(gl) == 0) | ||
| 600 | goto out; | ||
| 601 | gh = find_first_waiter(gl); | ||
| 602 | gl->gl_target = gh->gh_state; | ||
| 603 | if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) | ||
| 604 | do_error(gl, 0); /* Fail queued try locks */ | ||
| 605 | } | ||
| 606 | do_xmote(gl, gh, gl->gl_target); | ||
| 607 | return; | ||
| 608 | |||
| 609 | out_sched: | ||
| 610 | gfs2_glock_hold(gl); | ||
| 611 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
| 612 | gfs2_glock_put(gl); | ||
| 613 | out: | ||
| 614 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 615 | } | ||
| 616 | |||
| 284 | static void glock_work_func(struct work_struct *work) | 617 | static void glock_work_func(struct work_struct *work) |
| 285 | { | 618 | { |
| 619 | unsigned long delay = 0; | ||
| 286 | struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); | 620 | struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); |
| 287 | 621 | ||
| 622 | if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) | ||
| 623 | finish_xmote(gl, gl->gl_reply); | ||
| 288 | spin_lock(&gl->gl_spin); | 624 | spin_lock(&gl->gl_spin); |
| 289 | if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) | 625 | if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && |
| 290 | set_bit(GLF_DEMOTE, &gl->gl_flags); | 626 | gl->gl_state != LM_ST_UNLOCKED && |
| 291 | run_queue(gl); | 627 | gl->gl_demote_state != LM_ST_EXCLUSIVE) { |
| 628 | unsigned long holdtime, now = jiffies; | ||
| 629 | holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; | ||
| 630 | if (time_before(now, holdtime)) | ||
| 631 | delay = holdtime - now; | ||
| 632 | set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); | ||
| 633 | } | ||
| 634 | run_queue(gl, 0); | ||
| 292 | spin_unlock(&gl->gl_spin); | 635 | spin_unlock(&gl->gl_spin); |
| 293 | gfs2_glock_put(gl); | 636 | if (!delay || |
| 637 | queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) | ||
| 638 | gfs2_glock_put(gl); | ||
| 294 | } | 639 | } |
| 295 | 640 | ||
| 296 | static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, | 641 | static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, |
| 297 | void **lockp) | 642 | void **lockp) |
| 298 | { | 643 | { |
| 299 | int error = -EIO; | 644 | int error = -EIO; |
| 645 | if (!sdp->sd_lockstruct.ls_ops->lm_get_lock) | ||
| 646 | return 0; | ||
| 300 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 647 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 301 | error = sdp->sd_lockstruct.ls_ops->lm_get_lock( | 648 | error = sdp->sd_lockstruct.ls_ops->lm_get_lock( |
| 302 | sdp->sd_lockstruct.ls_lockspace, name, lockp); | 649 | sdp->sd_lockstruct.ls_lockspace, name, lockp); |
| @@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
| 342 | gl->gl_name = name; | 689 | gl->gl_name = name; |
| 343 | atomic_set(&gl->gl_ref, 1); | 690 | atomic_set(&gl->gl_ref, 1); |
| 344 | gl->gl_state = LM_ST_UNLOCKED; | 691 | gl->gl_state = LM_ST_UNLOCKED; |
| 692 | gl->gl_target = LM_ST_UNLOCKED; | ||
| 345 | gl->gl_demote_state = LM_ST_EXCLUSIVE; | 693 | gl->gl_demote_state = LM_ST_EXCLUSIVE; |
| 346 | gl->gl_hash = hash; | 694 | gl->gl_hash = hash; |
| 347 | gl->gl_owner_pid = NULL; | ||
| 348 | gl->gl_ip = 0; | ||
| 349 | gl->gl_ops = glops; | 695 | gl->gl_ops = glops; |
| 350 | gl->gl_req_gh = NULL; | ||
| 351 | gl->gl_stamp = jiffies; | 696 | gl->gl_stamp = jiffies; |
| 352 | gl->gl_tchange = jiffies; | 697 | gl->gl_tchange = jiffies; |
| 353 | gl->gl_object = NULL; | 698 | gl->gl_object = NULL; |
| @@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) | |||
| 447 | gh->gh_ip = 0; | 792 | gh->gh_ip = 0; |
| 448 | } | 793 | } |
| 449 | 794 | ||
| 450 | static void gfs2_holder_wake(struct gfs2_holder *gh) | ||
| 451 | { | ||
| 452 | clear_bit(HIF_WAIT, &gh->gh_iflags); | ||
| 453 | smp_mb__after_clear_bit(); | ||
| 454 | wake_up_bit(&gh->gh_iflags, HIF_WAIT); | ||
| 455 | } | ||
| 456 | |||
| 457 | static int just_schedule(void *word) | 795 | static int just_schedule(void *word) |
| 458 | { | 796 | { |
| 459 | schedule(); | 797 | schedule(); |
| @@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh) | |||
| 466 | wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); | 804 | wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); |
| 467 | } | 805 | } |
| 468 | 806 | ||
| 469 | static void gfs2_demote_wake(struct gfs2_glock *gl) | ||
| 470 | { | ||
| 471 | gl->gl_demote_state = LM_ST_EXCLUSIVE; | ||
| 472 | clear_bit(GLF_DEMOTE, &gl->gl_flags); | ||
| 473 | smp_mb__after_clear_bit(); | ||
| 474 | wake_up_bit(&gl->gl_flags, GLF_DEMOTE); | ||
| 475 | } | ||
| 476 | |||
| 477 | static void wait_on_demote(struct gfs2_glock *gl) | 807 | static void wait_on_demote(struct gfs2_glock *gl) |
| 478 | { | 808 | { |
| 479 | might_sleep(); | 809 | might_sleep(); |
| @@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl) | |||
| 481 | } | 811 | } |
| 482 | 812 | ||
| 483 | /** | 813 | /** |
| 484 | * rq_mutex - process a mutex request in the queue | ||
| 485 | * @gh: the glock holder | ||
| 486 | * | ||
| 487 | * Returns: 1 if the queue is blocked | ||
| 488 | */ | ||
| 489 | |||
| 490 | static int rq_mutex(struct gfs2_holder *gh) | ||
| 491 | { | ||
| 492 | struct gfs2_glock *gl = gh->gh_gl; | ||
| 493 | |||
| 494 | list_del_init(&gh->gh_list); | ||
| 495 | /* gh->gh_error never examined. */ | ||
| 496 | set_bit(GLF_LOCK, &gl->gl_flags); | ||
| 497 | clear_bit(HIF_WAIT, &gh->gh_iflags); | ||
| 498 | smp_mb(); | ||
| 499 | wake_up_bit(&gh->gh_iflags, HIF_WAIT); | ||
| 500 | |||
| 501 | return 1; | ||
| 502 | } | ||
| 503 | |||
| 504 | /** | ||
| 505 | * rq_promote - process a promote request in the queue | ||
| 506 | * @gh: the glock holder | ||
| 507 | * | ||
| 508 | * Acquire a new inter-node lock, or change a lock state to more restrictive. | ||
| 509 | * | ||
| 510 | * Returns: 1 if the queue is blocked | ||
| 511 | */ | ||
| 512 | |||
| 513 | static int rq_promote(struct gfs2_holder *gh) | ||
| 514 | { | ||
| 515 | struct gfs2_glock *gl = gh->gh_gl; | ||
| 516 | |||
| 517 | if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { | ||
| 518 | if (list_empty(&gl->gl_holders)) { | ||
| 519 | gl->gl_req_gh = gh; | ||
| 520 | set_bit(GLF_LOCK, &gl->gl_flags); | ||
| 521 | spin_unlock(&gl->gl_spin); | ||
| 522 | gfs2_glock_xmote_th(gh->gh_gl, gh); | ||
| 523 | spin_lock(&gl->gl_spin); | ||
| 524 | } | ||
| 525 | return 1; | ||
| 526 | } | ||
| 527 | |||
| 528 | if (list_empty(&gl->gl_holders)) { | ||
| 529 | set_bit(HIF_FIRST, &gh->gh_iflags); | ||
| 530 | set_bit(GLF_LOCK, &gl->gl_flags); | ||
| 531 | } else { | ||
| 532 | struct gfs2_holder *next_gh; | ||
| 533 | if (gh->gh_state == LM_ST_EXCLUSIVE) | ||
| 534 | return 1; | ||
| 535 | next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, | ||
| 536 | gh_list); | ||
| 537 | if (next_gh->gh_state == LM_ST_EXCLUSIVE) | ||
| 538 | return 1; | ||
| 539 | } | ||
| 540 | |||
| 541 | list_move_tail(&gh->gh_list, &gl->gl_holders); | ||
| 542 | gh->gh_error = 0; | ||
| 543 | set_bit(HIF_HOLDER, &gh->gh_iflags); | ||
| 544 | |||
| 545 | gfs2_holder_wake(gh); | ||
| 546 | |||
| 547 | return 0; | ||
| 548 | } | ||
| 549 | |||
| 550 | /** | ||
| 551 | * rq_demote - process a demote request in the queue | ||
| 552 | * @gh: the glock holder | ||
| 553 | * | ||
| 554 | * Returns: 1 if the queue is blocked | ||
| 555 | */ | ||
| 556 | |||
| 557 | static int rq_demote(struct gfs2_glock *gl) | ||
| 558 | { | ||
| 559 | if (!list_empty(&gl->gl_holders)) | ||
| 560 | return 1; | ||
| 561 | |||
| 562 | if (gl->gl_state == gl->gl_demote_state || | ||
| 563 | gl->gl_state == LM_ST_UNLOCKED) { | ||
| 564 | gfs2_demote_wake(gl); | ||
| 565 | return 0; | ||
| 566 | } | ||
| 567 | |||
| 568 | set_bit(GLF_LOCK, &gl->gl_flags); | ||
| 569 | set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); | ||
| 570 | |||
| 571 | if (gl->gl_demote_state == LM_ST_UNLOCKED || | ||
| 572 | gl->gl_state != LM_ST_EXCLUSIVE) { | ||
| 573 | spin_unlock(&gl->gl_spin); | ||
| 574 | gfs2_glock_drop_th(gl); | ||
| 575 | } else { | ||
| 576 | spin_unlock(&gl->gl_spin); | ||
| 577 | gfs2_glock_xmote_th(gl, NULL); | ||
| 578 | } | ||
| 579 | |||
| 580 | spin_lock(&gl->gl_spin); | ||
| 581 | clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); | ||
| 582 | |||
| 583 | return 0; | ||
| 584 | } | ||
| 585 | |||
| 586 | /** | ||
| 587 | * run_queue - process holder structures on a glock | ||
| 588 | * @gl: the glock | ||
| 589 | * | ||
| 590 | */ | ||
| 591 | static void run_queue(struct gfs2_glock *gl) | ||
| 592 | { | ||
| 593 | struct gfs2_holder *gh; | ||
| 594 | int blocked = 1; | ||
| 595 | |||
| 596 | for (;;) { | ||
| 597 | if (test_bit(GLF_LOCK, &gl->gl_flags)) | ||
| 598 | break; | ||
| 599 | |||
| 600 | if (!list_empty(&gl->gl_waiters1)) { | ||
| 601 | gh = list_entry(gl->gl_waiters1.next, | ||
| 602 | struct gfs2_holder, gh_list); | ||
| 603 | blocked = rq_mutex(gh); | ||
| 604 | } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { | ||
| 605 | blocked = rq_demote(gl); | ||
| 606 | if (test_bit(GLF_WAITERS2, &gl->gl_flags) && | ||
| 607 | !blocked) { | ||
| 608 | set_bit(GLF_DEMOTE, &gl->gl_flags); | ||
| 609 | gl->gl_demote_state = LM_ST_UNLOCKED; | ||
| 610 | } | ||
| 611 | clear_bit(GLF_WAITERS2, &gl->gl_flags); | ||
| 612 | } else if (!list_empty(&gl->gl_waiters3)) { | ||
| 613 | gh = list_entry(gl->gl_waiters3.next, | ||
| 614 | struct gfs2_holder, gh_list); | ||
| 615 | blocked = rq_promote(gh); | ||
| 616 | } else | ||
| 617 | break; | ||
| 618 | |||
| 619 | if (blocked) | ||
| 620 | break; | ||
| 621 | } | ||
| 622 | } | ||
| 623 | |||
| 624 | /** | ||
| 625 | * gfs2_glmutex_lock - acquire a local lock on a glock | ||
| 626 | * @gl: the glock | ||
| 627 | * | ||
| 628 | * Gives caller exclusive access to manipulate a glock structure. | ||
| 629 | */ | ||
| 630 | |||
| 631 | static void gfs2_glmutex_lock(struct gfs2_glock *gl) | ||
| 632 | { | ||
| 633 | spin_lock(&gl->gl_spin); | ||
| 634 | if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { | ||
| 635 | struct gfs2_holder gh; | ||
| 636 | |||
| 637 | gfs2_holder_init(gl, 0, 0, &gh); | ||
| 638 | set_bit(HIF_WAIT, &gh.gh_iflags); | ||
| 639 | list_add_tail(&gh.gh_list, &gl->gl_waiters1); | ||
| 640 | spin_unlock(&gl->gl_spin); | ||
| 641 | wait_on_holder(&gh); | ||
| 642 | gfs2_holder_uninit(&gh); | ||
| 643 | } else { | ||
| 644 | gl->gl_owner_pid = get_pid(task_pid(current)); | ||
| 645 | gl->gl_ip = (unsigned long)__builtin_return_address(0); | ||
| 646 | spin_unlock(&gl->gl_spin); | ||
| 647 | } | ||
| 648 | } | ||
| 649 | |||
| 650 | /** | ||
| 651 | * gfs2_glmutex_trylock - try to acquire a local lock on a glock | ||
| 652 | * @gl: the glock | ||
| 653 | * | ||
| 654 | * Returns: 1 if the glock is acquired | ||
| 655 | */ | ||
| 656 | |||
| 657 | static int gfs2_glmutex_trylock(struct gfs2_glock *gl) | ||
| 658 | { | ||
| 659 | int acquired = 1; | ||
| 660 | |||
| 661 | spin_lock(&gl->gl_spin); | ||
| 662 | if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { | ||
| 663 | acquired = 0; | ||
| 664 | } else { | ||
| 665 | gl->gl_owner_pid = get_pid(task_pid(current)); | ||
| 666 | gl->gl_ip = (unsigned long)__builtin_return_address(0); | ||
| 667 | } | ||
| 668 | spin_unlock(&gl->gl_spin); | ||
| 669 | |||
| 670 | return acquired; | ||
| 671 | } | ||
| 672 | |||
| 673 | /** | ||
| 674 | * gfs2_glmutex_unlock - release a local lock on a glock | ||
| 675 | * @gl: the glock | ||
| 676 | * | ||
| 677 | */ | ||
| 678 | |||
| 679 | static void gfs2_glmutex_unlock(struct gfs2_glock *gl) | ||
| 680 | { | ||
| 681 | struct pid *pid; | ||
| 682 | |||
| 683 | spin_lock(&gl->gl_spin); | ||
| 684 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 685 | pid = gl->gl_owner_pid; | ||
| 686 | gl->gl_owner_pid = NULL; | ||
| 687 | gl->gl_ip = 0; | ||
| 688 | run_queue(gl); | ||
| 689 | spin_unlock(&gl->gl_spin); | ||
| 690 | |||
| 691 | put_pid(pid); | ||
| 692 | } | ||
| 693 | |||
| 694 | /** | ||
| 695 | * handle_callback - process a demote request | 814 | * handle_callback - process a demote request |
| 696 | * @gl: the glock | 815 | * @gl: the glock |
| 697 | * @state: the state the caller wants us to change to | 816 | * @state: the state the caller wants us to change to |
| @@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, | |||
| 705 | { | 824 | { |
| 706 | int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; | 825 | int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; |
| 707 | 826 | ||
| 708 | spin_lock(&gl->gl_spin); | ||
| 709 | set_bit(bit, &gl->gl_flags); | 827 | set_bit(bit, &gl->gl_flags); |
| 710 | if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { | 828 | if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { |
| 711 | gl->gl_demote_state = state; | 829 | gl->gl_demote_state = state; |
| 712 | gl->gl_demote_time = jiffies; | 830 | gl->gl_demote_time = jiffies; |
| 713 | if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && | 831 | if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && |
| 714 | gl->gl_object) { | 832 | gl->gl_object) |
| 715 | gfs2_glock_schedule_for_reclaim(gl); | 833 | gfs2_glock_schedule_for_reclaim(gl); |
| 716 | spin_unlock(&gl->gl_spin); | ||
| 717 | return; | ||
| 718 | } | ||
| 719 | } else if (gl->gl_demote_state != LM_ST_UNLOCKED && | 834 | } else if (gl->gl_demote_state != LM_ST_UNLOCKED && |
| 720 | gl->gl_demote_state != state) { | 835 | gl->gl_demote_state != state) { |
| 721 | if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) | 836 | gl->gl_demote_state = LM_ST_UNLOCKED; |
| 722 | set_bit(GLF_WAITERS2, &gl->gl_flags); | ||
| 723 | else | ||
| 724 | gl->gl_demote_state = LM_ST_UNLOCKED; | ||
| 725 | } | ||
| 726 | spin_unlock(&gl->gl_spin); | ||
| 727 | } | ||
| 728 | |||
| 729 | /** | ||
| 730 | * state_change - record that the glock is now in a different state | ||
| 731 | * @gl: the glock | ||
| 732 | * @new_state the new state | ||
| 733 | * | ||
| 734 | */ | ||
| 735 | |||
| 736 | static void state_change(struct gfs2_glock *gl, unsigned int new_state) | ||
| 737 | { | ||
| 738 | int held1, held2; | ||
| 739 | |||
| 740 | held1 = (gl->gl_state != LM_ST_UNLOCKED); | ||
| 741 | held2 = (new_state != LM_ST_UNLOCKED); | ||
| 742 | |||
| 743 | if (held1 != held2) { | ||
| 744 | if (held2) | ||
| 745 | gfs2_glock_hold(gl); | ||
| 746 | else | ||
| 747 | gfs2_glock_put(gl); | ||
| 748 | } | 837 | } |
| 749 | |||
| 750 | gl->gl_state = new_state; | ||
| 751 | gl->gl_tchange = jiffies; | ||
| 752 | } | 838 | } |
| 753 | 839 | ||
| 754 | /** | 840 | /** |
| 755 | * drop_bh - Called after a lock module unlock completes | 841 | * gfs2_glock_wait - wait on a glock acquisition |
| 756 | * @gl: the glock | ||
| 757 | * @ret: the return status | ||
| 758 | * | ||
| 759 | * Doesn't wake up the process waiting on the struct gfs2_holder (if any) | ||
| 760 | * Doesn't drop the reference on the glock the top half took out | ||
| 761 | * | ||
| 762 | */ | ||
| 763 | |||
| 764 | static void drop_bh(struct gfs2_glock *gl, unsigned int ret) | ||
| 765 | { | ||
| 766 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 767 | struct gfs2_holder *gh = gl->gl_req_gh; | ||
| 768 | |||
| 769 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 770 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | ||
| 771 | gfs2_assert_warn(sdp, !ret); | ||
| 772 | |||
| 773 | state_change(gl, LM_ST_UNLOCKED); | ||
| 774 | |||
| 775 | if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) { | ||
| 776 | spin_lock(&gl->gl_spin); | ||
| 777 | gh->gh_error = 0; | ||
| 778 | spin_unlock(&gl->gl_spin); | ||
| 779 | gfs2_glock_xmote_th(gl, gl->gl_req_gh); | ||
| 780 | gfs2_glock_put(gl); | ||
| 781 | return; | ||
| 782 | } | ||
| 783 | |||
| 784 | spin_lock(&gl->gl_spin); | ||
| 785 | gfs2_demote_wake(gl); | ||
| 786 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 787 | spin_unlock(&gl->gl_spin); | ||
| 788 | gfs2_glock_put(gl); | ||
| 789 | } | ||
| 790 | |||
| 791 | /** | ||
| 792 | * xmote_bh - Called after the lock module is done acquiring a lock | ||
| 793 | * @gl: The glock in question | ||
| 794 | * @ret: the int returned from the lock module | ||
| 795 | * | ||
| 796 | */ | ||
| 797 | |||
| 798 | static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) | ||
| 799 | { | ||
| 800 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 801 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 802 | struct gfs2_holder *gh = gl->gl_req_gh; | ||
| 803 | int op_done = 1; | ||
| 804 | |||
| 805 | if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) { | ||
| 806 | drop_bh(gl, ret); | ||
| 807 | return; | ||
| 808 | } | ||
| 809 | |||
| 810 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 811 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | ||
| 812 | gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); | ||
| 813 | |||
| 814 | state_change(gl, ret & LM_OUT_ST_MASK); | ||
| 815 | |||
| 816 | /* Deal with each possible exit condition */ | ||
| 817 | |||
| 818 | if (!gh) { | ||
| 819 | gl->gl_stamp = jiffies; | ||
| 820 | if (ret & LM_OUT_CANCELED) { | ||
| 821 | op_done = 0; | ||
| 822 | } else { | ||
| 823 | spin_lock(&gl->gl_spin); | ||
| 824 | if (gl->gl_state != gl->gl_demote_state) { | ||
| 825 | spin_unlock(&gl->gl_spin); | ||
| 826 | gfs2_glock_drop_th(gl); | ||
| 827 | gfs2_glock_put(gl); | ||
| 828 | return; | ||
| 829 | } | ||
| 830 | gfs2_demote_wake(gl); | ||
| 831 | spin_unlock(&gl->gl_spin); | ||
| 832 | } | ||
| 833 | } else { | ||
| 834 | spin_lock(&gl->gl_spin); | ||
| 835 | if (ret & LM_OUT_CONV_DEADLK) { | ||
| 836 | gh->gh_error = 0; | ||
| 837 | set_bit(GLF_CONV_DEADLK, &gl->gl_flags); | ||
| 838 | spin_unlock(&gl->gl_spin); | ||
| 839 | gfs2_glock_drop_th(gl); | ||
| 840 | gfs2_glock_put(gl); | ||
| 841 | return; | ||
| 842 | } | ||
| 843 | list_del_init(&gh->gh_list); | ||
| 844 | gh->gh_error = -EIO; | ||
| 845 | if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 846 | goto out; | ||
| 847 | gh->gh_error = GLR_CANCELED; | ||
| 848 | if (ret & LM_OUT_CANCELED) | ||
| 849 | goto out; | ||
| 850 | if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { | ||
| 851 | list_add_tail(&gh->gh_list, &gl->gl_holders); | ||
| 852 | gh->gh_error = 0; | ||
| 853 | set_bit(HIF_HOLDER, &gh->gh_iflags); | ||
| 854 | set_bit(HIF_FIRST, &gh->gh_iflags); | ||
| 855 | op_done = 0; | ||
| 856 | goto out; | ||
| 857 | } | ||
| 858 | gh->gh_error = GLR_TRYFAILED; | ||
| 859 | if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) | ||
| 860 | goto out; | ||
| 861 | gh->gh_error = -EINVAL; | ||
| 862 | if (gfs2_assert_withdraw(sdp, 0) == -1) | ||
| 863 | fs_err(sdp, "ret = 0x%.8X\n", ret); | ||
| 864 | out: | ||
| 865 | spin_unlock(&gl->gl_spin); | ||
| 866 | } | ||
| 867 | |||
| 868 | if (glops->go_xmote_bh) | ||
| 869 | glops->go_xmote_bh(gl); | ||
| 870 | |||
| 871 | if (op_done) { | ||
| 872 | spin_lock(&gl->gl_spin); | ||
| 873 | gl->gl_req_gh = NULL; | ||
| 874 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 875 | spin_unlock(&gl->gl_spin); | ||
| 876 | } | ||
| 877 | |||
| 878 | gfs2_glock_put(gl); | ||
| 879 | |||
| 880 | if (gh) | ||
| 881 | gfs2_holder_wake(gh); | ||
| 882 | } | ||
| 883 | |||
| 884 | static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, | ||
| 885 | unsigned int cur_state, unsigned int req_state, | ||
| 886 | unsigned int flags) | ||
| 887 | { | ||
| 888 | int ret = 0; | ||
| 889 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 890 | ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, | ||
| 891 | req_state, flags); | ||
| 892 | return ret; | ||
| 893 | } | ||
| 894 | |||
| 895 | /** | ||
| 896 | * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock | ||
| 897 | * @gl: The glock in question | ||
| 898 | * @state: the requested state | ||
| 899 | * @flags: modifier flags to the lock call | ||
| 900 | * | ||
| 901 | */ | ||
| 902 | |||
| 903 | static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) | ||
| 904 | { | ||
| 905 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 906 | int flags = gh ? gh->gh_flags : 0; | ||
| 907 | unsigned state = gh ? gh->gh_state : gl->gl_demote_state; | ||
| 908 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 909 | int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | | ||
| 910 | LM_FLAG_NOEXP | LM_FLAG_ANY | | ||
| 911 | LM_FLAG_PRIORITY); | ||
| 912 | unsigned int lck_ret; | ||
| 913 | |||
| 914 | if (glops->go_xmote_th) | ||
| 915 | glops->go_xmote_th(gl); | ||
| 916 | if (state == LM_ST_DEFERRED && glops->go_inval) | ||
| 917 | glops->go_inval(gl, DIO_METADATA); | ||
| 918 | |||
| 919 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 920 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | ||
| 921 | gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); | ||
| 922 | gfs2_assert_warn(sdp, state != gl->gl_state); | ||
| 923 | |||
| 924 | gfs2_glock_hold(gl); | ||
| 925 | |||
| 926 | lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); | ||
| 927 | |||
| 928 | if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR))) | ||
| 929 | return; | ||
| 930 | |||
| 931 | if (lck_ret & LM_OUT_ASYNC) | ||
| 932 | gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC); | ||
| 933 | else | ||
| 934 | xmote_bh(gl, lck_ret); | ||
| 935 | } | ||
| 936 | |||
| 937 | static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, | ||
| 938 | unsigned int cur_state) | ||
| 939 | { | ||
| 940 | int ret = 0; | ||
| 941 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 942 | ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); | ||
| 943 | return ret; | ||
| 944 | } | ||
| 945 | |||
| 946 | /** | ||
| 947 | * gfs2_glock_drop_th - call into the lock module to unlock a lock | ||
| 948 | * @gl: the glock | ||
| 949 | * | ||
| 950 | */ | ||
| 951 | |||
| 952 | static void gfs2_glock_drop_th(struct gfs2_glock *gl) | ||
| 953 | { | ||
| 954 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 955 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 956 | unsigned int ret; | ||
| 957 | |||
| 958 | if (glops->go_xmote_th) | ||
| 959 | glops->go_xmote_th(gl); | ||
| 960 | if (glops->go_inval) | ||
| 961 | glops->go_inval(gl, DIO_METADATA); | ||
| 962 | |||
| 963 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 964 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | ||
| 965 | gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); | ||
| 966 | |||
| 967 | gfs2_glock_hold(gl); | ||
| 968 | |||
| 969 | ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); | ||
| 970 | |||
| 971 | if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR))) | ||
| 972 | return; | ||
| 973 | |||
| 974 | if (!ret) | ||
| 975 | drop_bh(gl, ret); | ||
| 976 | else | ||
| 977 | gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC); | ||
| 978 | } | ||
| 979 | |||
| 980 | /** | ||
| 981 | * do_cancels - cancel requests for locks stuck waiting on an expire flag | ||
| 982 | * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock | ||
| 983 | * | ||
| 984 | * Don't cancel GL_NOCANCEL requests. | ||
| 985 | */ | ||
| 986 | |||
| 987 | static void do_cancels(struct gfs2_holder *gh) | ||
| 988 | { | ||
| 989 | struct gfs2_glock *gl = gh->gh_gl; | ||
| 990 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 991 | |||
| 992 | spin_lock(&gl->gl_spin); | ||
| 993 | |||
| 994 | while (gl->gl_req_gh != gh && | ||
| 995 | !test_bit(HIF_HOLDER, &gh->gh_iflags) && | ||
| 996 | !list_empty(&gh->gh_list)) { | ||
| 997 | if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { | ||
| 998 | spin_unlock(&gl->gl_spin); | ||
| 999 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 1000 | sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); | ||
| 1001 | msleep(100); | ||
| 1002 | spin_lock(&gl->gl_spin); | ||
| 1003 | } else { | ||
| 1004 | spin_unlock(&gl->gl_spin); | ||
| 1005 | msleep(100); | ||
| 1006 | spin_lock(&gl->gl_spin); | ||
| 1007 | } | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | spin_unlock(&gl->gl_spin); | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | /** | ||
| 1014 | * glock_wait_internal - wait on a glock acquisition | ||
| 1015 | * @gh: the glock holder | 842 | * @gh: the glock holder |
| 1016 | * | 843 | * |
| 1017 | * Returns: 0 on success | 844 | * Returns: 0 on success |
| 1018 | */ | 845 | */ |
| 1019 | 846 | ||
| 1020 | static int glock_wait_internal(struct gfs2_holder *gh) | 847 | int gfs2_glock_wait(struct gfs2_holder *gh) |
| 1021 | { | 848 | { |
| 1022 | struct gfs2_glock *gl = gh->gh_gl; | ||
| 1023 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 1024 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
| 1025 | |||
| 1026 | if (test_bit(HIF_ABORTED, &gh->gh_iflags)) | ||
| 1027 | return -EIO; | ||
| 1028 | |||
| 1029 | if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { | ||
| 1030 | spin_lock(&gl->gl_spin); | ||
| 1031 | if (gl->gl_req_gh != gh && | ||
| 1032 | !test_bit(HIF_HOLDER, &gh->gh_iflags) && | ||
| 1033 | !list_empty(&gh->gh_list)) { | ||
| 1034 | list_del_init(&gh->gh_list); | ||
| 1035 | gh->gh_error = GLR_TRYFAILED; | ||
| 1036 | run_queue(gl); | ||
| 1037 | spin_unlock(&gl->gl_spin); | ||
| 1038 | return gh->gh_error; | ||
| 1039 | } | ||
| 1040 | spin_unlock(&gl->gl_spin); | ||
| 1041 | } | ||
| 1042 | |||
| 1043 | if (gh->gh_flags & LM_FLAG_PRIORITY) | ||
| 1044 | do_cancels(gh); | ||
| 1045 | |||
| 1046 | wait_on_holder(gh); | 849 | wait_on_holder(gh); |
| 1047 | if (gh->gh_error) | ||
| 1048 | return gh->gh_error; | ||
| 1049 | |||
| 1050 | gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags)); | ||
| 1051 | gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state, | ||
| 1052 | gh->gh_flags)); | ||
| 1053 | |||
| 1054 | if (test_bit(HIF_FIRST, &gh->gh_iflags)) { | ||
| 1055 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 1056 | |||
| 1057 | if (glops->go_lock) { | ||
| 1058 | gh->gh_error = glops->go_lock(gh); | ||
| 1059 | if (gh->gh_error) { | ||
| 1060 | spin_lock(&gl->gl_spin); | ||
| 1061 | list_del_init(&gh->gh_list); | ||
| 1062 | spin_unlock(&gl->gl_spin); | ||
| 1063 | } | ||
| 1064 | } | ||
| 1065 | |||
| 1066 | spin_lock(&gl->gl_spin); | ||
| 1067 | gl->gl_req_gh = NULL; | ||
| 1068 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 1069 | run_queue(gl); | ||
| 1070 | spin_unlock(&gl->gl_spin); | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | return gh->gh_error; | 850 | return gh->gh_error; |
| 1074 | } | 851 | } |
| 1075 | 852 | ||
| 1076 | static inline struct gfs2_holder * | 853 | void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) |
| 1077 | find_holder_by_owner(struct list_head *head, struct pid *pid) | ||
| 1078 | { | ||
| 1079 | struct gfs2_holder *gh; | ||
| 1080 | |||
| 1081 | list_for_each_entry(gh, head, gh_list) { | ||
| 1082 | if (gh->gh_owner_pid == pid) | ||
| 1083 | return gh; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | return NULL; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static void print_dbg(struct glock_iter *gi, const char *fmt, ...) | ||
| 1090 | { | 854 | { |
| 1091 | va_list args; | 855 | va_list args; |
| 1092 | 856 | ||
| 1093 | va_start(args, fmt); | 857 | va_start(args, fmt); |
| 1094 | if (gi) { | 858 | if (seq) { |
| 859 | struct gfs2_glock_iter *gi = seq->private; | ||
| 1095 | vsprintf(gi->string, fmt, args); | 860 | vsprintf(gi->string, fmt, args); |
| 1096 | seq_printf(gi->seq, gi->string); | 861 | seq_printf(seq, gi->string); |
| 1097 | } | 862 | } else { |
| 1098 | else | 863 | printk(KERN_ERR " "); |
| 1099 | vprintk(fmt, args); | 864 | vprintk(fmt, args); |
| 865 | } | ||
| 1100 | va_end(args); | 866 | va_end(args); |
| 1101 | } | 867 | } |
| 1102 | 868 | ||
| @@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...) | |||
| 1104 | * add_to_queue - Add a holder to the wait queue (but look for recursion) | 870 | * add_to_queue - Add a holder to the wait queue (but look for recursion) |
| 1105 | * @gh: the holder structure to add | 871 | * @gh: the holder structure to add |
| 1106 | * | 872 | * |
| 873 | * Eventually we should move the recursive locking trap to a | ||
| 874 | * debugging option or something like that. This is the fast | ||
| 875 | * path and needs to have the minimum number of distractions. | ||
| 876 | * | ||
| 1107 | */ | 877 | */ |
| 1108 | 878 | ||
| 1109 | static void add_to_queue(struct gfs2_holder *gh) | 879 | static inline void add_to_queue(struct gfs2_holder *gh) |
| 1110 | { | 880 | { |
| 1111 | struct gfs2_glock *gl = gh->gh_gl; | 881 | struct gfs2_glock *gl = gh->gh_gl; |
| 1112 | struct gfs2_holder *existing; | 882 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 883 | struct list_head *insert_pt = NULL; | ||
| 884 | struct gfs2_holder *gh2; | ||
| 885 | int try_lock = 0; | ||
| 1113 | 886 | ||
| 1114 | BUG_ON(gh->gh_owner_pid == NULL); | 887 | BUG_ON(gh->gh_owner_pid == NULL); |
| 1115 | if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) | 888 | if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) |
| 1116 | BUG(); | 889 | BUG(); |
| 1117 | 890 | ||
| 1118 | if (!(gh->gh_flags & GL_FLOCK)) { | 891 | if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { |
| 1119 | existing = find_holder_by_owner(&gl->gl_holders, | 892 | if (test_bit(GLF_LOCK, &gl->gl_flags)) |
| 1120 | gh->gh_owner_pid); | 893 | try_lock = 1; |
| 1121 | if (existing) { | 894 | if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) |
| 1122 | print_symbol(KERN_WARNING "original: %s\n", | 895 | goto fail; |
| 1123 | existing->gh_ip); | 896 | } |
| 1124 | printk(KERN_INFO "pid : %d\n", | 897 | |
| 1125 | pid_nr(existing->gh_owner_pid)); | 898 | list_for_each_entry(gh2, &gl->gl_holders, gh_list) { |
| 1126 | printk(KERN_INFO "lock type : %d lock state : %d\n", | 899 | if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && |
| 1127 | existing->gh_gl->gl_name.ln_type, | 900 | (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) |
| 1128 | existing->gh_gl->gl_state); | 901 | goto trap_recursive; |
| 1129 | print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); | 902 | if (try_lock && |
| 1130 | printk(KERN_INFO "pid : %d\n", | 903 | !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && |
| 1131 | pid_nr(gh->gh_owner_pid)); | 904 | !may_grant(gl, gh)) { |
| 1132 | printk(KERN_INFO "lock type : %d lock state : %d\n", | 905 | fail: |
| 1133 | gl->gl_name.ln_type, gl->gl_state); | 906 | gh->gh_error = GLR_TRYFAILED; |
| 1134 | BUG(); | 907 | gfs2_holder_wake(gh); |
| 1135 | } | 908 | return; |
| 1136 | |||
| 1137 | existing = find_holder_by_owner(&gl->gl_waiters3, | ||
| 1138 | gh->gh_owner_pid); | ||
| 1139 | if (existing) { | ||
| 1140 | print_symbol(KERN_WARNING "original: %s\n", | ||
| 1141 | existing->gh_ip); | ||
| 1142 | print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); | ||
| 1143 | BUG(); | ||
| 1144 | } | 909 | } |
| 910 | if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) | ||
| 911 | continue; | ||
| 912 | if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) | ||
| 913 | insert_pt = &gh2->gh_list; | ||
| 914 | } | ||
| 915 | if (likely(insert_pt == NULL)) { | ||
| 916 | list_add_tail(&gh->gh_list, &gl->gl_holders); | ||
| 917 | if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) | ||
| 918 | goto do_cancel; | ||
| 919 | return; | ||
| 920 | } | ||
| 921 | list_add_tail(&gh->gh_list, insert_pt); | ||
| 922 | do_cancel: | ||
| 923 | gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); | ||
| 924 | if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { | ||
| 925 | spin_unlock(&gl->gl_spin); | ||
| 926 | if (sdp->sd_lockstruct.ls_ops->lm_cancel) | ||
| 927 | sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); | ||
| 928 | spin_lock(&gl->gl_spin); | ||
| 1145 | } | 929 | } |
| 930 | return; | ||
| 1146 | 931 | ||
| 1147 | if (gh->gh_flags & LM_FLAG_PRIORITY) | 932 | trap_recursive: |
| 1148 | list_add(&gh->gh_list, &gl->gl_waiters3); | 933 | print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); |
| 1149 | else | 934 | printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); |
| 1150 | list_add_tail(&gh->gh_list, &gl->gl_waiters3); | 935 | printk(KERN_ERR "lock type: %d req lock state : %d\n", |
| 936 | gh2->gh_gl->gl_name.ln_type, gh2->gh_state); | ||
| 937 | print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); | ||
| 938 | printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); | ||
| 939 | printk(KERN_ERR "lock type: %d req lock state : %d\n", | ||
| 940 | gh->gh_gl->gl_name.ln_type, gh->gh_state); | ||
| 941 | __dump_glock(NULL, gl); | ||
| 942 | BUG(); | ||
| 1151 | } | 943 | } |
| 1152 | 944 | ||
| 1153 | /** | 945 | /** |
| @@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh) | |||
| 1165 | struct gfs2_sbd *sdp = gl->gl_sbd; | 957 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 1166 | int error = 0; | 958 | int error = 0; |
| 1167 | 959 | ||
| 1168 | restart: | 960 | if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 1169 | if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { | ||
| 1170 | set_bit(HIF_ABORTED, &gh->gh_iflags); | ||
| 1171 | return -EIO; | 961 | return -EIO; |
| 1172 | } | ||
| 1173 | 962 | ||
| 1174 | spin_lock(&gl->gl_spin); | 963 | spin_lock(&gl->gl_spin); |
| 1175 | add_to_queue(gh); | 964 | add_to_queue(gh); |
| 1176 | run_queue(gl); | 965 | run_queue(gl, 1); |
| 1177 | spin_unlock(&gl->gl_spin); | 966 | spin_unlock(&gl->gl_spin); |
| 1178 | 967 | ||
| 1179 | if (!(gh->gh_flags & GL_ASYNC)) { | 968 | if (!(gh->gh_flags & GL_ASYNC)) |
| 1180 | error = glock_wait_internal(gh); | 969 | error = gfs2_glock_wait(gh); |
| 1181 | if (error == GLR_CANCELED) { | ||
| 1182 | msleep(100); | ||
| 1183 | goto restart; | ||
| 1184 | } | ||
| 1185 | } | ||
| 1186 | 970 | ||
| 1187 | return error; | 971 | return error; |
| 1188 | } | 972 | } |
| @@ -1196,48 +980,7 @@ restart: | |||
| 1196 | 980 | ||
| 1197 | int gfs2_glock_poll(struct gfs2_holder *gh) | 981 | int gfs2_glock_poll(struct gfs2_holder *gh) |
| 1198 | { | 982 | { |
| 1199 | struct gfs2_glock *gl = gh->gh_gl; | 983 | return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; |
| 1200 | int ready = 0; | ||
| 1201 | |||
| 1202 | spin_lock(&gl->gl_spin); | ||
| 1203 | |||
| 1204 | if (test_bit(HIF_HOLDER, &gh->gh_iflags)) | ||
| 1205 | ready = 1; | ||
| 1206 | else if (list_empty(&gh->gh_list)) { | ||
| 1207 | if (gh->gh_error == GLR_CANCELED) { | ||
| 1208 | spin_unlock(&gl->gl_spin); | ||
| 1209 | msleep(100); | ||
| 1210 | if (gfs2_glock_nq(gh)) | ||
| 1211 | return 1; | ||
| 1212 | return 0; | ||
| 1213 | } else | ||
| 1214 | ready = 1; | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | spin_unlock(&gl->gl_spin); | ||
| 1218 | |||
| 1219 | return ready; | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | /** | ||
| 1223 | * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC | ||
| 1224 | * @gh: the holder structure | ||
| 1225 | * | ||
| 1226 | * Returns: 0, GLR_TRYFAILED, or errno on failure | ||
| 1227 | */ | ||
| 1228 | |||
| 1229 | int gfs2_glock_wait(struct gfs2_holder *gh) | ||
| 1230 | { | ||
| 1231 | int error; | ||
| 1232 | |||
| 1233 | error = glock_wait_internal(gh); | ||
| 1234 | if (error == GLR_CANCELED) { | ||
| 1235 | msleep(100); | ||
| 1236 | gh->gh_flags &= ~GL_ASYNC; | ||
| 1237 | error = gfs2_glock_nq(gh); | ||
| 1238 | } | ||
| 1239 | |||
| 1240 | return error; | ||
| 1241 | } | 984 | } |
| 1242 | 985 | ||
| 1243 | /** | 986 | /** |
| @@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh) | |||
| 1251 | struct gfs2_glock *gl = gh->gh_gl; | 994 | struct gfs2_glock *gl = gh->gh_gl; |
| 1252 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 995 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
| 1253 | unsigned delay = 0; | 996 | unsigned delay = 0; |
| 997 | int fast_path = 0; | ||
| 1254 | 998 | ||
| 999 | spin_lock(&gl->gl_spin); | ||
| 1255 | if (gh->gh_flags & GL_NOCACHE) | 1000 | if (gh->gh_flags & GL_NOCACHE) |
| 1256 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); | 1001 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); |
| 1257 | 1002 | ||
| 1258 | gfs2_glmutex_lock(gl); | ||
| 1259 | |||
| 1260 | spin_lock(&gl->gl_spin); | ||
| 1261 | list_del_init(&gh->gh_list); | 1003 | list_del_init(&gh->gh_list); |
| 1262 | 1004 | if (find_first_holder(gl) == NULL) { | |
| 1263 | if (list_empty(&gl->gl_holders)) { | ||
| 1264 | if (glops->go_unlock) { | 1005 | if (glops->go_unlock) { |
| 1006 | GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 1265 | spin_unlock(&gl->gl_spin); | 1007 | spin_unlock(&gl->gl_spin); |
| 1266 | glops->go_unlock(gh); | 1008 | glops->go_unlock(gh); |
| 1267 | spin_lock(&gl->gl_spin); | 1009 | spin_lock(&gl->gl_spin); |
| 1010 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 1268 | } | 1011 | } |
| 1269 | gl->gl_stamp = jiffies; | 1012 | gl->gl_stamp = jiffies; |
| 1013 | if (list_empty(&gl->gl_holders) && | ||
| 1014 | !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && | ||
| 1015 | !test_bit(GLF_DEMOTE, &gl->gl_flags)) | ||
| 1016 | fast_path = 1; | ||
| 1270 | } | 1017 | } |
| 1271 | |||
| 1272 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 1273 | spin_unlock(&gl->gl_spin); | 1018 | spin_unlock(&gl->gl_spin); |
| 1019 | if (likely(fast_path)) | ||
| 1020 | return; | ||
| 1274 | 1021 | ||
| 1275 | gfs2_glock_hold(gl); | 1022 | gfs2_glock_hold(gl); |
| 1276 | if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && | 1023 | if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && |
| @@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) | |||
| 1454 | static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) | 1201 | static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) |
| 1455 | { | 1202 | { |
| 1456 | int error = -EIO; | 1203 | int error = -EIO; |
| 1204 | if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb) | ||
| 1205 | return 0; | ||
| 1457 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 1206 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 1458 | error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); | 1207 | error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); |
| 1459 | return error; | 1208 | return error; |
| @@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl) | |||
| 1469 | { | 1218 | { |
| 1470 | int error; | 1219 | int error; |
| 1471 | 1220 | ||
| 1472 | gfs2_glmutex_lock(gl); | ||
| 1473 | |||
| 1474 | if (!atomic_read(&gl->gl_lvb_count)) { | 1221 | if (!atomic_read(&gl->gl_lvb_count)) { |
| 1475 | error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); | 1222 | error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); |
| 1476 | if (error) { | 1223 | if (error) |
| 1477 | gfs2_glmutex_unlock(gl); | ||
| 1478 | return error; | 1224 | return error; |
| 1479 | } | ||
| 1480 | gfs2_glock_hold(gl); | 1225 | gfs2_glock_hold(gl); |
| 1481 | } | 1226 | } |
| 1482 | atomic_inc(&gl->gl_lvb_count); | 1227 | atomic_inc(&gl->gl_lvb_count); |
| 1483 | 1228 | ||
| 1484 | gfs2_glmutex_unlock(gl); | ||
| 1485 | |||
| 1486 | return 0; | 1229 | return 0; |
| 1487 | } | 1230 | } |
| 1488 | 1231 | ||
| @@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl) | |||
| 1497 | struct gfs2_sbd *sdp = gl->gl_sbd; | 1240 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 1498 | 1241 | ||
| 1499 | gfs2_glock_hold(gl); | 1242 | gfs2_glock_hold(gl); |
| 1500 | gfs2_glmutex_lock(gl); | ||
| 1501 | |||
| 1502 | gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); | 1243 | gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); |
| 1503 | if (atomic_dec_and_test(&gl->gl_lvb_count)) { | 1244 | if (atomic_dec_and_test(&gl->gl_lvb_count)) { |
| 1504 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 1245 | if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb) |
| 1505 | sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); | 1246 | sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); |
| 1506 | gl->gl_lvb = NULL; | 1247 | gl->gl_lvb = NULL; |
| 1507 | gfs2_glock_put(gl); | 1248 | gfs2_glock_put(gl); |
| 1508 | } | 1249 | } |
| 1509 | |||
| 1510 | gfs2_glmutex_unlock(gl); | ||
| 1511 | gfs2_glock_put(gl); | 1250 | gfs2_glock_put(gl); |
| 1512 | } | 1251 | } |
| 1513 | 1252 | ||
| @@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, | |||
| 1527 | if (time_before(now, holdtime)) | 1266 | if (time_before(now, holdtime)) |
| 1528 | delay = holdtime - now; | 1267 | delay = holdtime - now; |
| 1529 | 1268 | ||
| 1269 | spin_lock(&gl->gl_spin); | ||
| 1530 | handle_callback(gl, state, 1, delay); | 1270 | handle_callback(gl, state, 1, delay); |
| 1271 | spin_unlock(&gl->gl_spin); | ||
| 1531 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) | 1272 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) |
| 1532 | gfs2_glock_put(gl); | 1273 | gfs2_glock_put(gl); |
| 1533 | } | 1274 | } |
| @@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) | |||
| 1568 | gl = gfs2_glock_find(sdp, &async->lc_name); | 1309 | gl = gfs2_glock_find(sdp, &async->lc_name); |
| 1569 | if (gfs2_assert_warn(sdp, gl)) | 1310 | if (gfs2_assert_warn(sdp, gl)) |
| 1570 | return; | 1311 | return; |
| 1571 | xmote_bh(gl, async->lc_ret); | 1312 | gl->gl_reply = async->lc_ret; |
| 1313 | set_bit(GLF_REPLY_PENDING, &gl->gl_flags); | ||
| 1572 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | 1314 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) |
| 1573 | gfs2_glock_put(gl); | 1315 | gfs2_glock_put(gl); |
| 1574 | up_read(&gfs2_umount_flush_sem); | 1316 | up_read(&gfs2_umount_flush_sem); |
| @@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) | |||
| 1581 | wake_up_process(sdp->sd_recoverd_process); | 1323 | wake_up_process(sdp->sd_recoverd_process); |
| 1582 | return; | 1324 | return; |
| 1583 | 1325 | ||
| 1584 | case LM_CB_DROPLOCKS: | ||
| 1585 | gfs2_gl_hash_clear(sdp, NO_WAIT); | ||
| 1586 | gfs2_quota_scan(sdp); | ||
| 1587 | return; | ||
| 1588 | |||
| 1589 | default: | 1326 | default: |
| 1590 | gfs2_assert_warn(sdp, 0); | 1327 | gfs2_assert_warn(sdp, 0); |
| 1591 | return; | 1328 | return; |
| @@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) | |||
| 1646 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp) | 1383 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp) |
| 1647 | { | 1384 | { |
| 1648 | struct gfs2_glock *gl; | 1385 | struct gfs2_glock *gl; |
| 1386 | int done_callback = 0; | ||
| 1649 | 1387 | ||
| 1650 | spin_lock(&sdp->sd_reclaim_lock); | 1388 | spin_lock(&sdp->sd_reclaim_lock); |
| 1651 | if (list_empty(&sdp->sd_reclaim_list)) { | 1389 | if (list_empty(&sdp->sd_reclaim_list)) { |
| @@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp) | |||
| 1660 | atomic_dec(&sdp->sd_reclaim_count); | 1398 | atomic_dec(&sdp->sd_reclaim_count); |
| 1661 | atomic_inc(&sdp->sd_reclaimed); | 1399 | atomic_inc(&sdp->sd_reclaimed); |
| 1662 | 1400 | ||
| 1663 | if (gfs2_glmutex_trylock(gl)) { | 1401 | spin_lock(&gl->gl_spin); |
| 1664 | if (list_empty(&gl->gl_holders) && | 1402 | if (find_first_holder(gl) == NULL && |
| 1665 | gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) | 1403 | gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { |
| 1666 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); | 1404 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); |
| 1667 | gfs2_glmutex_unlock(gl); | 1405 | done_callback = 1; |
| 1668 | } | 1406 | } |
| 1669 | 1407 | spin_unlock(&gl->gl_spin); | |
| 1670 | gfs2_glock_put(gl); | 1408 | if (!done_callback || |
| 1409 | queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
| 1410 | gfs2_glock_put(gl); | ||
| 1671 | } | 1411 | } |
| 1672 | 1412 | ||
| 1673 | /** | 1413 | /** |
| @@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl) | |||
| 1724 | { | 1464 | { |
| 1725 | if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) | 1465 | if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) |
| 1726 | return; | 1466 | return; |
| 1467 | if (test_bit(GLF_LOCK, &gl->gl_flags)) | ||
| 1468 | return; | ||
| 1727 | 1469 | ||
| 1728 | if (gfs2_glmutex_trylock(gl)) { | 1470 | spin_lock(&gl->gl_spin); |
| 1729 | if (list_empty(&gl->gl_holders) && | 1471 | if (find_first_holder(gl) == NULL && |
| 1730 | gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) | 1472 | gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) |
| 1731 | goto out_schedule; | 1473 | gfs2_glock_schedule_for_reclaim(gl); |
| 1732 | gfs2_glmutex_unlock(gl); | 1474 | spin_unlock(&gl->gl_spin); |
| 1733 | } | ||
| 1734 | return; | ||
| 1735 | |||
| 1736 | out_schedule: | ||
| 1737 | gfs2_glmutex_unlock(gl); | ||
| 1738 | gfs2_glock_schedule_for_reclaim(gl); | ||
| 1739 | } | 1475 | } |
| 1740 | 1476 | ||
| 1741 | /** | 1477 | /** |
| @@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl) | |||
| 1760 | spin_unlock(&sdp->sd_reclaim_lock); | 1496 | spin_unlock(&sdp->sd_reclaim_lock); |
| 1761 | } | 1497 | } |
| 1762 | 1498 | ||
| 1763 | if (gfs2_glmutex_trylock(gl)) { | 1499 | spin_lock(&gl->gl_spin); |
| 1764 | if (list_empty(&gl->gl_holders) && | 1500 | if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) |
| 1765 | gl->gl_state != LM_ST_UNLOCKED) | 1501 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); |
| 1766 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); | 1502 | spin_unlock(&gl->gl_spin); |
| 1767 | gfs2_glmutex_unlock(gl); | 1503 | gfs2_glock_hold(gl); |
| 1768 | } | 1504 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) |
| 1505 | gfs2_glock_put(gl); | ||
| 1769 | } | 1506 | } |
| 1770 | 1507 | ||
| 1771 | /** | 1508 | /** |
| @@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl) | |||
| 1773 | * @sdp: the filesystem | 1510 | * @sdp: the filesystem |
| 1774 | * @wait: wait until it's all gone | 1511 | * @wait: wait until it's all gone |
| 1775 | * | 1512 | * |
| 1776 | * Called when unmounting the filesystem, or when inter-node lock manager | 1513 | * Called when unmounting the filesystem. |
| 1777 | * requests DROPLOCKS because it is running out of capacity. | ||
| 1778 | */ | 1514 | */ |
| 1779 | 1515 | ||
| 1780 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) | 1516 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) |
| 1781 | { | 1517 | { |
| 1782 | unsigned long t; | 1518 | unsigned long t; |
| 1783 | unsigned int x; | 1519 | unsigned int x; |
| @@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) | |||
| 1792 | cont = 1; | 1528 | cont = 1; |
| 1793 | } | 1529 | } |
| 1794 | 1530 | ||
| 1795 | if (!wait || !cont) | 1531 | if (!cont) |
| 1796 | break; | 1532 | break; |
| 1797 | 1533 | ||
| 1798 | if (time_after_eq(jiffies, | 1534 | if (time_after_eq(jiffies, |
| @@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) | |||
| 1810 | } | 1546 | } |
| 1811 | } | 1547 | } |
| 1812 | 1548 | ||
| 1813 | /* | 1549 | static const char *state2str(unsigned state) |
| 1814 | * Diagnostic routines to help debug distributed deadlock | ||
| 1815 | */ | ||
| 1816 | |||
| 1817 | static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt, | ||
| 1818 | unsigned long address) | ||
| 1819 | { | 1550 | { |
| 1820 | char buffer[KSYM_SYMBOL_LEN]; | 1551 | switch(state) { |
| 1821 | 1552 | case LM_ST_UNLOCKED: | |
| 1822 | sprint_symbol(buffer, address); | 1553 | return "UN"; |
| 1823 | print_dbg(gi, fmt, buffer); | 1554 | case LM_ST_SHARED: |
| 1555 | return "SH"; | ||
| 1556 | case LM_ST_DEFERRED: | ||
| 1557 | return "DF"; | ||
| 1558 | case LM_ST_EXCLUSIVE: | ||
| 1559 | return "EX"; | ||
| 1560 | } | ||
| 1561 | return "??"; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) | ||
| 1565 | { | ||
| 1566 | char *p = buf; | ||
| 1567 | if (flags & LM_FLAG_TRY) | ||
| 1568 | *p++ = 't'; | ||
| 1569 | if (flags & LM_FLAG_TRY_1CB) | ||
| 1570 | *p++ = 'T'; | ||
| 1571 | if (flags & LM_FLAG_NOEXP) | ||
| 1572 | *p++ = 'e'; | ||
| 1573 | if (flags & LM_FLAG_ANY) | ||
| 1574 | *p++ = 'a'; | ||
| 1575 | if (flags & LM_FLAG_PRIORITY) | ||
| 1576 | *p++ = 'p'; | ||
| 1577 | if (flags & GL_ASYNC) | ||
| 1578 | *p++ = 'a'; | ||
| 1579 | if (flags & GL_EXACT) | ||
| 1580 | *p++ = 'E'; | ||
| 1581 | if (flags & GL_ATIME) | ||
| 1582 | *p++ = 'a'; | ||
| 1583 | if (flags & GL_NOCACHE) | ||
| 1584 | *p++ = 'c'; | ||
| 1585 | if (test_bit(HIF_HOLDER, &iflags)) | ||
| 1586 | *p++ = 'H'; | ||
| 1587 | if (test_bit(HIF_WAIT, &iflags)) | ||
| 1588 | *p++ = 'W'; | ||
| 1589 | if (test_bit(HIF_FIRST, &iflags)) | ||
| 1590 | *p++ = 'F'; | ||
| 1591 | *p = 0; | ||
| 1592 | return buf; | ||
| 1824 | } | 1593 | } |
| 1825 | 1594 | ||
| 1826 | /** | 1595 | /** |
| 1827 | * dump_holder - print information about a glock holder | 1596 | * dump_holder - print information about a glock holder |
| 1828 | * @str: a string naming the type of holder | 1597 | * @seq: the seq_file struct |
| 1829 | * @gh: the glock holder | 1598 | * @gh: the glock holder |
| 1830 | * | 1599 | * |
| 1831 | * Returns: 0 on success, -ENOBUFS when we run out of space | 1600 | * Returns: 0 on success, -ENOBUFS when we run out of space |
| 1832 | */ | 1601 | */ |
| 1833 | 1602 | ||
| 1834 | static int dump_holder(struct glock_iter *gi, char *str, | 1603 | static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) |
| 1835 | struct gfs2_holder *gh) | ||
| 1836 | { | 1604 | { |
| 1837 | unsigned int x; | 1605 | struct task_struct *gh_owner = NULL; |
| 1838 | struct task_struct *gh_owner; | 1606 | char buffer[KSYM_SYMBOL_LEN]; |
| 1607 | char flags_buf[32]; | ||
| 1839 | 1608 | ||
| 1840 | print_dbg(gi, " %s\n", str); | 1609 | sprint_symbol(buffer, gh->gh_ip); |
| 1841 | if (gh->gh_owner_pid) { | 1610 | if (gh->gh_owner_pid) |
| 1842 | print_dbg(gi, " owner = %ld ", | ||
| 1843 | (long)pid_nr(gh->gh_owner_pid)); | ||
| 1844 | gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); | 1611 | gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); |
| 1845 | if (gh_owner) | 1612 | gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", |
| 1846 | print_dbg(gi, "(%s)\n", gh_owner->comm); | 1613 | state2str(gh->gh_state), |
| 1847 | else | 1614 | hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), |
| 1848 | print_dbg(gi, "(ended)\n"); | 1615 | gh->gh_error, |
| 1849 | } else | 1616 | gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, |
| 1850 | print_dbg(gi, " owner = -1\n"); | 1617 | gh_owner ? gh_owner->comm : "(ended)", buffer); |
| 1851 | print_dbg(gi, " gh_state = %u\n", gh->gh_state); | ||
| 1852 | print_dbg(gi, " gh_flags ="); | ||
| 1853 | for (x = 0; x < 32; x++) | ||
| 1854 | if (gh->gh_flags & (1 << x)) | ||
| 1855 | print_dbg(gi, " %u", x); | ||
| 1856 | print_dbg(gi, " \n"); | ||
| 1857 | print_dbg(gi, " error = %d\n", gh->gh_error); | ||
| 1858 | print_dbg(gi, " gh_iflags ="); | ||
| 1859 | for (x = 0; x < 32; x++) | ||
| 1860 | if (test_bit(x, &gh->gh_iflags)) | ||
| 1861 | print_dbg(gi, " %u", x); | ||
| 1862 | print_dbg(gi, " \n"); | ||
| 1863 | gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip); | ||
| 1864 | |||
| 1865 | return 0; | 1618 | return 0; |
| 1866 | } | 1619 | } |
| 1867 | 1620 | ||
| 1868 | /** | 1621 | static const char *gflags2str(char *buf, const unsigned long *gflags) |
| 1869 | * dump_inode - print information about an inode | 1622 | { |
| 1870 | * @ip: the inode | 1623 | char *p = buf; |
| 1871 | * | 1624 | if (test_bit(GLF_LOCK, gflags)) |
| 1872 | * Returns: 0 on success, -ENOBUFS when we run out of space | 1625 | *p++ = 'l'; |
| 1873 | */ | 1626 | if (test_bit(GLF_STICKY, gflags)) |
| 1874 | 1627 | *p++ = 's'; | |
| 1875 | static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) | 1628 | if (test_bit(GLF_DEMOTE, gflags)) |
| 1876 | { | 1629 | *p++ = 'D'; |
| 1877 | unsigned int x; | 1630 | if (test_bit(GLF_PENDING_DEMOTE, gflags)) |
| 1878 | 1631 | *p++ = 'd'; | |
| 1879 | print_dbg(gi, " Inode:\n"); | 1632 | if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) |
| 1880 | print_dbg(gi, " num = %llu/%llu\n", | 1633 | *p++ = 'p'; |
| 1881 | (unsigned long long)ip->i_no_formal_ino, | 1634 | if (test_bit(GLF_DIRTY, gflags)) |
| 1882 | (unsigned long long)ip->i_no_addr); | 1635 | *p++ = 'y'; |
| 1883 | print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); | 1636 | if (test_bit(GLF_LFLUSH, gflags)) |
| 1884 | print_dbg(gi, " i_flags ="); | 1637 | *p++ = 'f'; |
| 1885 | for (x = 0; x < 32; x++) | 1638 | if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) |
| 1886 | if (test_bit(x, &ip->i_flags)) | 1639 | *p++ = 'i'; |
| 1887 | print_dbg(gi, " %u", x); | 1640 | if (test_bit(GLF_REPLY_PENDING, gflags)) |
| 1888 | print_dbg(gi, " \n"); | 1641 | *p++ = 'r'; |
| 1889 | return 0; | 1642 | *p = 0; |
| 1643 | return buf; | ||
| 1890 | } | 1644 | } |
| 1891 | 1645 | ||
| 1892 | /** | 1646 | /** |
| 1893 | * dump_glock - print information about a glock | 1647 | * __dump_glock - print information about a glock |
| 1648 | * @seq: The seq_file struct | ||
| 1894 | * @gl: the glock | 1649 | * @gl: the glock |
| 1895 | * @count: where we are in the buffer | 1650 | * |
| 1651 | * The file format is as follows: | ||
| 1652 | * One line per object, capital letters are used to indicate objects | ||
| 1653 | * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, | ||
| 1654 | * other objects are indented by a single space and follow the glock to | ||
| 1655 | * which they are related. Fields are indicated by lower case letters | ||
| 1656 | * followed by a colon and the field value, except for strings which are in | ||
| 1657 | * [] so that its possible to see if they are composed of spaces for | ||
| 1658 | * example. The field's are n = number (id of the object), f = flags, | ||
| 1659 | * t = type, s = state, r = refcount, e = error, p = pid. | ||
| 1896 | * | 1660 | * |
| 1897 | * Returns: 0 on success, -ENOBUFS when we run out of space | 1661 | * Returns: 0 on success, -ENOBUFS when we run out of space |
| 1898 | */ | 1662 | */ |
| 1899 | 1663 | ||
| 1900 | static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) | 1664 | static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) |
| 1901 | { | 1665 | { |
| 1902 | struct gfs2_holder *gh; | 1666 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
| 1903 | unsigned int x; | 1667 | unsigned long long dtime; |
| 1904 | int error = -ENOBUFS; | 1668 | const struct gfs2_holder *gh; |
| 1905 | struct task_struct *gl_owner; | 1669 | char gflags_buf[32]; |
| 1670 | int error = 0; | ||
| 1906 | 1671 | ||
| 1907 | spin_lock(&gl->gl_spin); | 1672 | dtime = jiffies - gl->gl_demote_time; |
| 1673 | dtime *= 1000000/HZ; /* demote time in uSec */ | ||
| 1674 | if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) | ||
| 1675 | dtime = 0; | ||
| 1676 | gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", | ||
| 1677 | state2str(gl->gl_state), | ||
| 1678 | gl->gl_name.ln_type, | ||
| 1679 | (unsigned long long)gl->gl_name.ln_number, | ||
| 1680 | gflags2str(gflags_buf, &gl->gl_flags), | ||
| 1681 | state2str(gl->gl_target), | ||
| 1682 | state2str(gl->gl_demote_state), dtime, | ||
| 1683 | atomic_read(&gl->gl_lvb_count), | ||
| 1684 | atomic_read(&gl->gl_ail_count), | ||
| 1685 | atomic_read(&gl->gl_ref)); | ||
| 1908 | 1686 | ||
| 1909 | print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type, | ||
| 1910 | (unsigned long long)gl->gl_name.ln_number); | ||
| 1911 | print_dbg(gi, " gl_flags ="); | ||
| 1912 | for (x = 0; x < 32; x++) { | ||
| 1913 | if (test_bit(x, &gl->gl_flags)) | ||
| 1914 | print_dbg(gi, " %u", x); | ||
| 1915 | } | ||
| 1916 | if (!test_bit(GLF_LOCK, &gl->gl_flags)) | ||
| 1917 | print_dbg(gi, " (unlocked)"); | ||
| 1918 | print_dbg(gi, " \n"); | ||
| 1919 | print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref)); | ||
| 1920 | print_dbg(gi, " gl_state = %u\n", gl->gl_state); | ||
| 1921 | if (gl->gl_owner_pid) { | ||
| 1922 | gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID); | ||
| 1923 | if (gl_owner) | ||
| 1924 | print_dbg(gi, " gl_owner = pid %d (%s)\n", | ||
| 1925 | pid_nr(gl->gl_owner_pid), gl_owner->comm); | ||
| 1926 | else | ||
| 1927 | print_dbg(gi, " gl_owner = %d (ended)\n", | ||
| 1928 | pid_nr(gl->gl_owner_pid)); | ||
| 1929 | } else | ||
| 1930 | print_dbg(gi, " gl_owner = -1\n"); | ||
| 1931 | print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); | ||
| 1932 | print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); | ||
| 1933 | print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); | ||
| 1934 | print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); | ||
| 1935 | print_dbg(gi, " reclaim = %s\n", | ||
| 1936 | (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); | ||
| 1937 | if (gl->gl_aspace) | ||
| 1938 | print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, | ||
| 1939 | gl->gl_aspace->i_mapping->nrpages); | ||
| 1940 | else | ||
| 1941 | print_dbg(gi, " aspace = no\n"); | ||
| 1942 | print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count)); | ||
| 1943 | if (gl->gl_req_gh) { | ||
| 1944 | error = dump_holder(gi, "Request", gl->gl_req_gh); | ||
| 1945 | if (error) | ||
| 1946 | goto out; | ||
| 1947 | } | ||
| 1948 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { | 1687 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { |
| 1949 | error = dump_holder(gi, "Holder", gh); | 1688 | error = dump_holder(seq, gh); |
| 1950 | if (error) | 1689 | if (error) |
| 1951 | goto out; | 1690 | goto out; |
| 1952 | } | 1691 | } |
| 1953 | list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { | 1692 | if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) |
| 1954 | error = dump_holder(gi, "Waiter1", gh); | 1693 | error = glops->go_dump(seq, gl); |
| 1955 | if (error) | ||
| 1956 | goto out; | ||
| 1957 | } | ||
| 1958 | list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { | ||
| 1959 | error = dump_holder(gi, "Waiter3", gh); | ||
| 1960 | if (error) | ||
| 1961 | goto out; | ||
| 1962 | } | ||
| 1963 | if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { | ||
| 1964 | print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n", | ||
| 1965 | gl->gl_demote_state, (unsigned long long) | ||
| 1966 | (jiffies - gl->gl_demote_time)*(1000000/HZ)); | ||
| 1967 | } | ||
| 1968 | if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { | ||
| 1969 | if (!test_bit(GLF_LOCK, &gl->gl_flags) && | ||
| 1970 | list_empty(&gl->gl_holders)) { | ||
| 1971 | error = dump_inode(gi, gl->gl_object); | ||
| 1972 | if (error) | ||
| 1973 | goto out; | ||
| 1974 | } else { | ||
| 1975 | error = -ENOBUFS; | ||
| 1976 | print_dbg(gi, " Inode: busy\n"); | ||
| 1977 | } | ||
| 1978 | } | ||
| 1979 | |||
| 1980 | error = 0; | ||
| 1981 | |||
| 1982 | out: | 1694 | out: |
| 1983 | spin_unlock(&gl->gl_spin); | ||
| 1984 | return error; | 1695 | return error; |
| 1985 | } | 1696 | } |
| 1986 | 1697 | ||
| 1698 | static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) | ||
| 1699 | { | ||
| 1700 | int ret; | ||
| 1701 | spin_lock(&gl->gl_spin); | ||
| 1702 | ret = __dump_glock(seq, gl); | ||
| 1703 | spin_unlock(&gl->gl_spin); | ||
| 1704 | return ret; | ||
| 1705 | } | ||
| 1706 | |||
| 1987 | /** | 1707 | /** |
| 1988 | * gfs2_dump_lockstate - print out the current lockstate | 1708 | * gfs2_dump_lockstate - print out the current lockstate |
| 1989 | * @sdp: the filesystem | 1709 | * @sdp: the filesystem |
| @@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void) | |||
| 2086 | module_param(scand_secs, uint, S_IRUGO|S_IWUSR); | 1806 | module_param(scand_secs, uint, S_IRUGO|S_IWUSR); |
| 2087 | MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); | 1807 | MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); |
| 2088 | 1808 | ||
| 2089 | static int gfs2_glock_iter_next(struct glock_iter *gi) | 1809 | static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) |
| 2090 | { | 1810 | { |
| 2091 | struct gfs2_glock *gl; | 1811 | struct gfs2_glock *gl; |
| 2092 | 1812 | ||
| @@ -2104,7 +1824,7 @@ restart: | |||
| 2104 | gfs2_glock_put(gl); | 1824 | gfs2_glock_put(gl); |
| 2105 | if (gl && gi->gl == NULL) | 1825 | if (gl && gi->gl == NULL) |
| 2106 | gi->hash++; | 1826 | gi->hash++; |
| 2107 | while(gi->gl == NULL) { | 1827 | while (gi->gl == NULL) { |
| 2108 | if (gi->hash >= GFS2_GL_HASH_SIZE) | 1828 | if (gi->hash >= GFS2_GL_HASH_SIZE) |
| 2109 | return 1; | 1829 | return 1; |
| 2110 | read_lock(gl_lock_addr(gi->hash)); | 1830 | read_lock(gl_lock_addr(gi->hash)); |
| @@ -2122,58 +1842,34 @@ restart: | |||
| 2122 | return 0; | 1842 | return 0; |
| 2123 | } | 1843 | } |
| 2124 | 1844 | ||
| 2125 | static void gfs2_glock_iter_free(struct glock_iter *gi) | 1845 | static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi) |
| 2126 | { | 1846 | { |
| 2127 | if (gi->gl) | 1847 | if (gi->gl) |
| 2128 | gfs2_glock_put(gi->gl); | 1848 | gfs2_glock_put(gi->gl); |
| 2129 | kfree(gi); | ||
| 2130 | } | ||
| 2131 | |||
| 2132 | static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp) | ||
| 2133 | { | ||
| 2134 | struct glock_iter *gi; | ||
| 2135 | |||
| 2136 | gi = kmalloc(sizeof (*gi), GFP_KERNEL); | ||
| 2137 | if (!gi) | ||
| 2138 | return NULL; | ||
| 2139 | |||
| 2140 | gi->sdp = sdp; | ||
| 2141 | gi->hash = 0; | ||
| 2142 | gi->seq = NULL; | ||
| 2143 | gi->gl = NULL; | 1849 | gi->gl = NULL; |
| 2144 | memset(gi->string, 0, sizeof(gi->string)); | ||
| 2145 | |||
| 2146 | if (gfs2_glock_iter_next(gi)) { | ||
| 2147 | gfs2_glock_iter_free(gi); | ||
| 2148 | return NULL; | ||
| 2149 | } | ||
| 2150 | |||
| 2151 | return gi; | ||
| 2152 | } | 1850 | } |
| 2153 | 1851 | ||
| 2154 | static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos) | 1852 | static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) |
| 2155 | { | 1853 | { |
| 2156 | struct glock_iter *gi; | 1854 | struct gfs2_glock_iter *gi = seq->private; |
| 2157 | loff_t n = *pos; | 1855 | loff_t n = *pos; |
| 2158 | 1856 | ||
| 2159 | gi = gfs2_glock_iter_init(file->private); | 1857 | gi->hash = 0; |
| 2160 | if (!gi) | ||
| 2161 | return NULL; | ||
| 2162 | 1858 | ||
| 2163 | while(n--) { | 1859 | do { |
| 2164 | if (gfs2_glock_iter_next(gi)) { | 1860 | if (gfs2_glock_iter_next(gi)) { |
| 2165 | gfs2_glock_iter_free(gi); | 1861 | gfs2_glock_iter_free(gi); |
| 2166 | return NULL; | 1862 | return NULL; |
| 2167 | } | 1863 | } |
| 2168 | } | 1864 | } while (n--); |
| 2169 | 1865 | ||
| 2170 | return gi; | 1866 | return gi->gl; |
| 2171 | } | 1867 | } |
| 2172 | 1868 | ||
| 2173 | static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, | 1869 | static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, |
| 2174 | loff_t *pos) | 1870 | loff_t *pos) |
| 2175 | { | 1871 | { |
| 2176 | struct glock_iter *gi = iter_ptr; | 1872 | struct gfs2_glock_iter *gi = seq->private; |
| 2177 | 1873 | ||
| 2178 | (*pos)++; | 1874 | (*pos)++; |
| 2179 | 1875 | ||
| @@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, | |||
| 2182 | return NULL; | 1878 | return NULL; |
| 2183 | } | 1879 | } |
| 2184 | 1880 | ||
| 2185 | return gi; | 1881 | return gi->gl; |
| 2186 | } | 1882 | } |
| 2187 | 1883 | ||
| 2188 | static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) | 1884 | static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) |
| 2189 | { | 1885 | { |
| 2190 | struct glock_iter *gi = iter_ptr; | 1886 | struct gfs2_glock_iter *gi = seq->private; |
| 2191 | if (gi) | 1887 | gfs2_glock_iter_free(gi); |
| 2192 | gfs2_glock_iter_free(gi); | ||
| 2193 | } | 1888 | } |
| 2194 | 1889 | ||
| 2195 | static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) | 1890 | static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) |
| 2196 | { | 1891 | { |
| 2197 | struct glock_iter *gi = iter_ptr; | 1892 | return dump_glock(seq, iter_ptr); |
| 2198 | |||
| 2199 | gi->seq = file; | ||
| 2200 | dump_glock(gi, gi->gl); | ||
| 2201 | |||
| 2202 | return 0; | ||
| 2203 | } | 1893 | } |
| 2204 | 1894 | ||
| 2205 | static const struct seq_operations gfs2_glock_seq_ops = { | 1895 | static const struct seq_operations gfs2_glock_seq_ops = { |
| @@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = { | |||
| 2211 | 1901 | ||
| 2212 | static int gfs2_debugfs_open(struct inode *inode, struct file *file) | 1902 | static int gfs2_debugfs_open(struct inode *inode, struct file *file) |
| 2213 | { | 1903 | { |
| 2214 | struct seq_file *seq; | 1904 | int ret = seq_open_private(file, &gfs2_glock_seq_ops, |
| 2215 | int ret; | 1905 | sizeof(struct gfs2_glock_iter)); |
| 2216 | 1906 | if (ret == 0) { | |
| 2217 | ret = seq_open(file, &gfs2_glock_seq_ops); | 1907 | struct seq_file *seq = file->private_data; |
| 2218 | if (ret) | 1908 | struct gfs2_glock_iter *gi = seq->private; |
| 2219 | return ret; | 1909 | gi->sdp = inode->i_private; |
| 2220 | 1910 | } | |
| 2221 | seq = file->private_data; | 1911 | return ret; |
| 2222 | seq->private = inode->i_private; | ||
| 2223 | |||
| 2224 | return 0; | ||
| 2225 | } | 1912 | } |
| 2226 | 1913 | ||
| 2227 | static const struct file_operations gfs2_debug_fops = { | 1914 | static const struct file_operations gfs2_debug_fops = { |
| @@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = { | |||
| 2229 | .open = gfs2_debugfs_open, | 1916 | .open = gfs2_debugfs_open, |
| 2230 | .read = seq_read, | 1917 | .read = seq_read, |
| 2231 | .llseek = seq_lseek, | 1918 | .llseek = seq_lseek, |
| 2232 | .release = seq_release | 1919 | .release = seq_release_private, |
| 2233 | }; | 1920 | }; |
| 2234 | 1921 | ||
| 2235 | int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) | 1922 | int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) |
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index cdad3e6f8150..971d92af70fc 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h | |||
| @@ -26,11 +26,8 @@ | |||
| 26 | #define GL_SKIP 0x00000100 | 26 | #define GL_SKIP 0x00000100 |
| 27 | #define GL_ATIME 0x00000200 | 27 | #define GL_ATIME 0x00000200 |
| 28 | #define GL_NOCACHE 0x00000400 | 28 | #define GL_NOCACHE 0x00000400 |
| 29 | #define GL_FLOCK 0x00000800 | ||
| 30 | #define GL_NOCANCEL 0x00001000 | ||
| 31 | 29 | ||
| 32 | #define GLR_TRYFAILED 13 | 30 | #define GLR_TRYFAILED 13 |
| 33 | #define GLR_CANCELED 14 | ||
| 34 | 31 | ||
| 35 | static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) | 32 | static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) |
| 36 | { | 33 | { |
| @@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * | |||
| 41 | spin_lock(&gl->gl_spin); | 38 | spin_lock(&gl->gl_spin); |
| 42 | pid = task_pid(current); | 39 | pid = task_pid(current); |
| 43 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { | 40 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { |
| 41 | if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) | ||
| 42 | break; | ||
| 44 | if (gh->gh_owner_pid == pid) | 43 | if (gh->gh_owner_pid == pid) |
| 45 | goto out; | 44 | goto out; |
| 46 | } | 45 | } |
| @@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) | |||
| 70 | { | 69 | { |
| 71 | int ret; | 70 | int ret; |
| 72 | spin_lock(&gl->gl_spin); | 71 | spin_lock(&gl->gl_spin); |
| 73 | ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3); | 72 | ret = test_bit(GLF_DEMOTE, &gl->gl_flags); |
| 74 | spin_unlock(&gl->gl_spin); | 73 | spin_unlock(&gl->gl_spin); |
| 75 | return ret; | 74 | return ret; |
| 76 | } | 75 | } |
| @@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, | |||
| 98 | int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); | 97 | int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); |
| 99 | void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); | 98 | void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); |
| 100 | void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); | 99 | void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); |
| 100 | void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); | ||
| 101 | 101 | ||
| 102 | /** | 102 | /** |
| 103 | * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock | 103 | * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock |
| @@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl); | |||
| 130 | void gfs2_lvb_unhold(struct gfs2_glock *gl); | 130 | void gfs2_lvb_unhold(struct gfs2_glock *gl); |
| 131 | 131 | ||
| 132 | void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); | 132 | void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); |
| 133 | |||
| 134 | void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); | 133 | void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); |
| 135 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp); | 134 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp); |
| 136 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); | 135 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); |
| 137 | 136 | ||
| 138 | int __init gfs2_glock_init(void); | 137 | int __init gfs2_glock_init(void); |
| 139 | void gfs2_glock_exit(void); | 138 | void gfs2_glock_exit(void); |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 07d84d16cda4..c6c318c2a0f6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/buffer_head.h> | 13 | #include <linux/buffer_head.h> |
| 14 | #include <linux/gfs2_ondisk.h> | 14 | #include <linux/gfs2_ondisk.h> |
| 15 | #include <linux/lm_interface.h> | 15 | #include <linux/lm_interface.h> |
| 16 | #include <linux/bio.h> | ||
| 16 | 17 | ||
| 17 | #include "gfs2.h" | 18 | #include "gfs2.h" |
| 18 | #include "incore.h" | 19 | #include "incore.h" |
| @@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl) | |||
| 172 | } | 173 | } |
| 173 | 174 | ||
| 174 | /** | 175 | /** |
| 175 | * inode_go_xmote_bh - After promoting/demoting a glock | ||
| 176 | * @gl: the glock | ||
| 177 | * | ||
| 178 | */ | ||
| 179 | |||
| 180 | static void inode_go_xmote_bh(struct gfs2_glock *gl) | ||
| 181 | { | ||
| 182 | struct gfs2_holder *gh = gl->gl_req_gh; | ||
| 183 | struct buffer_head *bh; | ||
| 184 | int error; | ||
| 185 | |||
| 186 | if (gl->gl_state != LM_ST_UNLOCKED && | ||
| 187 | (!gh || !(gh->gh_flags & GL_SKIP))) { | ||
| 188 | error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh); | ||
| 189 | if (!error) | ||
| 190 | brelse(bh); | ||
| 191 | } | ||
| 192 | } | ||
| 193 | |||
| 194 | /** | ||
| 195 | * inode_go_inval - prepare a inode glock to be released | 176 | * inode_go_inval - prepare a inode glock to be released |
| 196 | * @gl: the glock | 177 | * @gl: the glock |
| 197 | * @flags: | 178 | * @flags: |
| @@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh) | |||
| 267 | } | 248 | } |
| 268 | 249 | ||
| 269 | /** | 250 | /** |
| 251 | * inode_go_dump - print information about an inode | ||
| 252 | * @seq: The iterator | ||
| 253 | * @ip: the inode | ||
| 254 | * | ||
| 255 | * Returns: 0 on success, -ENOBUFS when we run out of space | ||
| 256 | */ | ||
| 257 | |||
| 258 | static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | ||
| 259 | { | ||
| 260 | const struct gfs2_inode *ip = gl->gl_object; | ||
| 261 | if (ip == NULL) | ||
| 262 | return 0; | ||
| 263 | gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", | ||
| 264 | (unsigned long long)ip->i_no_formal_ino, | ||
| 265 | (unsigned long long)ip->i_no_addr, | ||
| 266 | IF2DT(ip->i_inode.i_mode), ip->i_flags); | ||
| 267 | return 0; | ||
| 268 | } | ||
| 269 | |||
| 270 | /** | ||
| 270 | * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock | 271 | * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock |
| 271 | * @gl: the glock | 272 | * @gl: the glock |
| 272 | * | 273 | * |
| @@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) | |||
| 306 | } | 307 | } |
| 307 | 308 | ||
| 308 | /** | 309 | /** |
| 310 | * rgrp_go_dump - print out an rgrp | ||
| 311 | * @seq: The iterator | ||
| 312 | * @gl: The glock in question | ||
| 313 | * | ||
| 314 | */ | ||
| 315 | |||
| 316 | static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | ||
| 317 | { | ||
| 318 | const struct gfs2_rgrpd *rgd = gl->gl_object; | ||
| 319 | if (rgd == NULL) | ||
| 320 | return 0; | ||
| 321 | gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); | ||
| 322 | return 0; | ||
| 323 | } | ||
| 324 | |||
| 325 | /** | ||
| 309 | * trans_go_sync - promote/demote the transaction glock | 326 | * trans_go_sync - promote/demote the transaction glock |
| 310 | * @gl: the glock | 327 | * @gl: the glock |
| 311 | * @state: the requested state | 328 | * @state: the requested state |
| @@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl) | |||
| 330 | * | 347 | * |
| 331 | */ | 348 | */ |
| 332 | 349 | ||
| 333 | static void trans_go_xmote_bh(struct gfs2_glock *gl) | 350 | static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) |
| 334 | { | 351 | { |
| 335 | struct gfs2_sbd *sdp = gl->gl_sbd; | 352 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 336 | struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); | 353 | struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); |
| @@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) | |||
| 338 | struct gfs2_log_header_host head; | 355 | struct gfs2_log_header_host head; |
| 339 | int error; | 356 | int error; |
| 340 | 357 | ||
| 341 | if (gl->gl_state != LM_ST_UNLOCKED && | 358 | if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { |
| 342 | test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { | ||
| 343 | j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); | 359 | j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); |
| 344 | 360 | ||
| 345 | error = gfs2_find_jhead(sdp->sd_jdesc, &head); | 361 | error = gfs2_find_jhead(sdp->sd_jdesc, &head); |
| @@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) | |||
| 354 | gfs2_log_pointers_init(sdp, head.lh_blkno); | 370 | gfs2_log_pointers_init(sdp, head.lh_blkno); |
| 355 | } | 371 | } |
| 356 | } | 372 | } |
| 373 | return 0; | ||
| 357 | } | 374 | } |
| 358 | 375 | ||
| 359 | /** | 376 | /** |
| @@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = { | |||
| 375 | 392 | ||
| 376 | const struct gfs2_glock_operations gfs2_inode_glops = { | 393 | const struct gfs2_glock_operations gfs2_inode_glops = { |
| 377 | .go_xmote_th = inode_go_sync, | 394 | .go_xmote_th = inode_go_sync, |
| 378 | .go_xmote_bh = inode_go_xmote_bh, | ||
| 379 | .go_inval = inode_go_inval, | 395 | .go_inval = inode_go_inval, |
| 380 | .go_demote_ok = inode_go_demote_ok, | 396 | .go_demote_ok = inode_go_demote_ok, |
| 381 | .go_lock = inode_go_lock, | 397 | .go_lock = inode_go_lock, |
| 398 | .go_dump = inode_go_dump, | ||
| 382 | .go_type = LM_TYPE_INODE, | 399 | .go_type = LM_TYPE_INODE, |
| 383 | .go_min_hold_time = HZ / 10, | 400 | .go_min_hold_time = HZ / 5, |
| 384 | }; | 401 | }; |
| 385 | 402 | ||
| 386 | const struct gfs2_glock_operations gfs2_rgrp_glops = { | 403 | const struct gfs2_glock_operations gfs2_rgrp_glops = { |
| @@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { | |||
| 389 | .go_demote_ok = rgrp_go_demote_ok, | 406 | .go_demote_ok = rgrp_go_demote_ok, |
| 390 | .go_lock = rgrp_go_lock, | 407 | .go_lock = rgrp_go_lock, |
| 391 | .go_unlock = rgrp_go_unlock, | 408 | .go_unlock = rgrp_go_unlock, |
| 409 | .go_dump = rgrp_go_dump, | ||
| 392 | .go_type = LM_TYPE_RGRP, | 410 | .go_type = LM_TYPE_RGRP, |
| 393 | .go_min_hold_time = HZ / 10, | 411 | .go_min_hold_time = HZ / 5, |
| 394 | }; | 412 | }; |
| 395 | 413 | ||
| 396 | const struct gfs2_glock_operations gfs2_trans_glops = { | 414 | const struct gfs2_glock_operations gfs2_trans_glops = { |
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index eabe5eac41da..448697a5c462 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
| @@ -77,7 +77,6 @@ struct gfs2_rgrp_host { | |||
| 77 | struct gfs2_rgrpd { | 77 | struct gfs2_rgrpd { |
| 78 | struct list_head rd_list; /* Link with superblock */ | 78 | struct list_head rd_list; /* Link with superblock */ |
| 79 | struct list_head rd_list_mru; | 79 | struct list_head rd_list_mru; |
| 80 | struct list_head rd_recent; /* Recently used rgrps */ | ||
| 81 | struct gfs2_glock *rd_gl; /* Glock for this rgrp */ | 80 | struct gfs2_glock *rd_gl; /* Glock for this rgrp */ |
| 82 | u64 rd_addr; /* grp block disk address */ | 81 | u64 rd_addr; /* grp block disk address */ |
| 83 | u64 rd_data0; /* first data location */ | 82 | u64 rd_data0; /* first data location */ |
| @@ -128,20 +127,20 @@ struct gfs2_bufdata { | |||
| 128 | 127 | ||
| 129 | struct gfs2_glock_operations { | 128 | struct gfs2_glock_operations { |
| 130 | void (*go_xmote_th) (struct gfs2_glock *gl); | 129 | void (*go_xmote_th) (struct gfs2_glock *gl); |
| 131 | void (*go_xmote_bh) (struct gfs2_glock *gl); | 130 | int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); |
| 132 | void (*go_inval) (struct gfs2_glock *gl, int flags); | 131 | void (*go_inval) (struct gfs2_glock *gl, int flags); |
| 133 | int (*go_demote_ok) (struct gfs2_glock *gl); | 132 | int (*go_demote_ok) (struct gfs2_glock *gl); |
| 134 | int (*go_lock) (struct gfs2_holder *gh); | 133 | int (*go_lock) (struct gfs2_holder *gh); |
| 135 | void (*go_unlock) (struct gfs2_holder *gh); | 134 | void (*go_unlock) (struct gfs2_holder *gh); |
| 135 | int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); | ||
| 136 | const int go_type; | 136 | const int go_type; |
| 137 | const unsigned long go_min_hold_time; | 137 | const unsigned long go_min_hold_time; |
| 138 | }; | 138 | }; |
| 139 | 139 | ||
| 140 | enum { | 140 | enum { |
| 141 | /* States */ | 141 | /* States */ |
| 142 | HIF_HOLDER = 6, | 142 | HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ |
| 143 | HIF_FIRST = 7, | 143 | HIF_FIRST = 7, |
| 144 | HIF_ABORTED = 9, | ||
| 145 | HIF_WAIT = 10, | 144 | HIF_WAIT = 10, |
| 146 | }; | 145 | }; |
| 147 | 146 | ||
| @@ -154,20 +153,20 @@ struct gfs2_holder { | |||
| 154 | unsigned gh_flags; | 153 | unsigned gh_flags; |
| 155 | 154 | ||
| 156 | int gh_error; | 155 | int gh_error; |
| 157 | unsigned long gh_iflags; | 156 | unsigned long gh_iflags; /* HIF_... */ |
| 158 | unsigned long gh_ip; | 157 | unsigned long gh_ip; |
| 159 | }; | 158 | }; |
| 160 | 159 | ||
| 161 | enum { | 160 | enum { |
| 162 | GLF_LOCK = 1, | 161 | GLF_LOCK = 1, |
| 163 | GLF_STICKY = 2, | 162 | GLF_STICKY = 2, |
| 164 | GLF_DEMOTE = 3, | 163 | GLF_DEMOTE = 3, |
| 165 | GLF_PENDING_DEMOTE = 4, | 164 | GLF_PENDING_DEMOTE = 4, |
| 166 | GLF_DIRTY = 5, | 165 | GLF_DEMOTE_IN_PROGRESS = 5, |
| 167 | GLF_DEMOTE_IN_PROGRESS = 6, | 166 | GLF_DIRTY = 6, |
| 168 | GLF_LFLUSH = 7, | 167 | GLF_LFLUSH = 7, |
| 169 | GLF_WAITERS2 = 8, | 168 | GLF_INVALIDATE_IN_PROGRESS = 8, |
| 170 | GLF_CONV_DEADLK = 9, | 169 | GLF_REPLY_PENDING = 9, |
| 171 | }; | 170 | }; |
| 172 | 171 | ||
| 173 | struct gfs2_glock { | 172 | struct gfs2_glock { |
| @@ -179,19 +178,14 @@ struct gfs2_glock { | |||
| 179 | spinlock_t gl_spin; | 178 | spinlock_t gl_spin; |
| 180 | 179 | ||
| 181 | unsigned int gl_state; | 180 | unsigned int gl_state; |
| 181 | unsigned int gl_target; | ||
| 182 | unsigned int gl_reply; | ||
| 182 | unsigned int gl_hash; | 183 | unsigned int gl_hash; |
| 183 | unsigned int gl_demote_state; /* state requested by remote node */ | 184 | unsigned int gl_demote_state; /* state requested by remote node */ |
| 184 | unsigned long gl_demote_time; /* time of first demote request */ | 185 | unsigned long gl_demote_time; /* time of first demote request */ |
| 185 | struct pid *gl_owner_pid; | ||
| 186 | unsigned long gl_ip; | ||
| 187 | struct list_head gl_holders; | 186 | struct list_head gl_holders; |
| 188 | struct list_head gl_waiters1; /* HIF_MUTEX */ | ||
| 189 | struct list_head gl_waiters3; /* HIF_PROMOTE */ | ||
| 190 | 187 | ||
| 191 | const struct gfs2_glock_operations *gl_ops; | 188 | const struct gfs2_glock_operations *gl_ops; |
| 192 | |||
| 193 | struct gfs2_holder *gl_req_gh; | ||
| 194 | |||
| 195 | void *gl_lock; | 189 | void *gl_lock; |
| 196 | char *gl_lvb; | 190 | char *gl_lvb; |
| 197 | atomic_t gl_lvb_count; | 191 | atomic_t gl_lvb_count; |
| @@ -427,7 +421,6 @@ struct gfs2_tune { | |||
| 427 | unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ | 421 | unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ |
| 428 | unsigned int gt_atime_quantum; /* Min secs between atime updates */ | 422 | unsigned int gt_atime_quantum; /* Min secs between atime updates */ |
| 429 | unsigned int gt_new_files_jdata; | 423 | unsigned int gt_new_files_jdata; |
| 430 | unsigned int gt_new_files_directio; | ||
| 431 | unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ | 424 | unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ |
| 432 | unsigned int gt_stall_secs; /* Detects trouble! */ | 425 | unsigned int gt_stall_secs; /* Detects trouble! */ |
| 433 | unsigned int gt_complain_secs; | 426 | unsigned int gt_complain_secs; |
| @@ -534,7 +527,6 @@ struct gfs2_sbd { | |||
| 534 | struct mutex sd_rindex_mutex; | 527 | struct mutex sd_rindex_mutex; |
| 535 | struct list_head sd_rindex_list; | 528 | struct list_head sd_rindex_list; |
| 536 | struct list_head sd_rindex_mru_list; | 529 | struct list_head sd_rindex_mru_list; |
| 537 | struct list_head sd_rindex_recent_list; | ||
| 538 | struct gfs2_rgrpd *sd_rindex_forward; | 530 | struct gfs2_rgrpd *sd_rindex_forward; |
| 539 | unsigned int sd_rgrps; | 531 | unsigned int sd_rgrps; |
| 540 | 532 | ||
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 09453d057e41..6da0ab355b8a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
| @@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | |||
| 504 | } | 504 | } |
| 505 | 505 | ||
| 506 | if (!is_root) { | 506 | if (!is_root) { |
| 507 | error = permission(dir, MAY_EXEC, NULL); | 507 | error = gfs2_permission(dir, MAY_EXEC); |
| 508 | if (error) | 508 | if (error) |
| 509 | goto out; | 509 | goto out; |
| 510 | } | 510 | } |
| @@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
| 667 | { | 667 | { |
| 668 | int error; | 668 | int error; |
| 669 | 669 | ||
| 670 | error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); | 670 | error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); |
| 671 | if (error) | 671 | if (error) |
| 672 | return error; | 672 | return error; |
| 673 | 673 | ||
| @@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
| 789 | if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || | 789 | if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || |
| 790 | gfs2_tune_get(sdp, gt_new_files_jdata)) | 790 | gfs2_tune_get(sdp, gt_new_files_jdata)) |
| 791 | di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); | 791 | di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); |
| 792 | if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) || | ||
| 793 | gfs2_tune_get(sdp, gt_new_files_directio)) | ||
| 794 | di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO); | ||
| 795 | } else if (S_ISDIR(mode)) { | 792 | } else if (S_ISDIR(mode)) { |
| 796 | di->di_flags |= cpu_to_be32(dip->i_di.di_flags & | 793 | di->di_flags |= cpu_to_be32(dip->i_di.di_flags & |
| 797 | GFS2_DIF_INHERIT_DIRECTIO); | ||
| 798 | di->di_flags |= cpu_to_be32(dip->i_di.di_flags & | ||
| 799 | GFS2_DIF_INHERIT_JDATA); | 794 | GFS2_DIF_INHERIT_JDATA); |
| 800 | } | 795 | } |
| 801 | 796 | ||
| @@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
| 1134 | if (IS_APPEND(&dip->i_inode)) | 1129 | if (IS_APPEND(&dip->i_inode)) |
| 1135 | return -EPERM; | 1130 | return -EPERM; |
| 1136 | 1131 | ||
| 1137 | error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); | 1132 | error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); |
| 1138 | if (error) | 1133 | if (error) |
| 1139 | return error; | 1134 | return error; |
| 1140 | 1135 | ||
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 580da454b38f..6074c2506f75 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
| @@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, | |||
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | 74 | ||
| 75 | void gfs2_inode_attr_in(struct gfs2_inode *ip); | ||
| 76 | void gfs2_set_iop(struct inode *inode); | 75 | void gfs2_set_iop(struct inode *inode); |
| 77 | struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, | 76 | struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, |
| 78 | u64 no_addr, u64 no_formal_ino, | 77 | u64 no_addr, u64 no_formal_ino, |
| @@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | |||
| 91 | struct gfs2_inode *ip); | 90 | struct gfs2_inode *ip); |
| 92 | int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | 91 | int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, |
| 93 | const struct gfs2_inode *ip); | 92 | const struct gfs2_inode *ip); |
| 93 | int gfs2_permission(struct inode *inode, int mask); | ||
| 94 | int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); | 94 | int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); |
| 95 | int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); | 95 | int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); |
| 96 | int gfs2_glock_nq_atime(struct gfs2_holder *gh); | 96 | int gfs2_glock_nq_atime(struct gfs2_holder *gh); |
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c index 663fee728783..523243a13a21 100644 --- a/fs/gfs2/locking.c +++ b/fs/gfs2/locking.c | |||
| @@ -23,12 +23,54 @@ struct lmh_wrapper { | |||
| 23 | const struct lm_lockops *lw_ops; | 23 | const struct lm_lockops *lw_ops; |
| 24 | }; | 24 | }; |
| 25 | 25 | ||
| 26 | static int nolock_mount(char *table_name, char *host_data, | ||
| 27 | lm_callback_t cb, void *cb_data, | ||
| 28 | unsigned int min_lvb_size, int flags, | ||
| 29 | struct lm_lockstruct *lockstruct, | ||
| 30 | struct kobject *fskobj); | ||
| 31 | |||
| 26 | /* List of registered low-level locking protocols. A file system selects one | 32 | /* List of registered low-level locking protocols. A file system selects one |
| 27 | of them by name at mount time, e.g. lock_nolock, lock_dlm. */ | 33 | of them by name at mount time, e.g. lock_nolock, lock_dlm. */ |
| 28 | 34 | ||
| 35 | static const struct lm_lockops nolock_ops = { | ||
| 36 | .lm_proto_name = "lock_nolock", | ||
| 37 | .lm_mount = nolock_mount, | ||
| 38 | }; | ||
| 39 | |||
| 40 | static struct lmh_wrapper nolock_proto = { | ||
| 41 | .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list), | ||
| 42 | .lw_ops = &nolock_ops, | ||
| 43 | }; | ||
| 44 | |||
| 29 | static LIST_HEAD(lmh_list); | 45 | static LIST_HEAD(lmh_list); |
| 30 | static DEFINE_MUTEX(lmh_lock); | 46 | static DEFINE_MUTEX(lmh_lock); |
| 31 | 47 | ||
| 48 | static int nolock_mount(char *table_name, char *host_data, | ||
| 49 | lm_callback_t cb, void *cb_data, | ||
| 50 | unsigned int min_lvb_size, int flags, | ||
| 51 | struct lm_lockstruct *lockstruct, | ||
| 52 | struct kobject *fskobj) | ||
| 53 | { | ||
| 54 | char *c; | ||
| 55 | unsigned int jid; | ||
| 56 | |||
| 57 | c = strstr(host_data, "jid="); | ||
| 58 | if (!c) | ||
| 59 | jid = 0; | ||
| 60 | else { | ||
| 61 | c += 4; | ||
| 62 | sscanf(c, "%u", &jid); | ||
| 63 | } | ||
| 64 | |||
| 65 | lockstruct->ls_jid = jid; | ||
| 66 | lockstruct->ls_first = 1; | ||
| 67 | lockstruct->ls_lvb_size = min_lvb_size; | ||
| 68 | lockstruct->ls_ops = &nolock_ops; | ||
| 69 | lockstruct->ls_flags = LM_LSFLAG_LOCAL; | ||
| 70 | |||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 32 | /** | 74 | /** |
| 33 | * gfs2_register_lockproto - Register a low-level locking protocol | 75 | * gfs2_register_lockproto - Register a low-level locking protocol |
| 34 | * @proto: the protocol definition | 76 | * @proto: the protocol definition |
| @@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, | |||
| 116 | int try = 0; | 158 | int try = 0; |
| 117 | int error, found; | 159 | int error, found; |
| 118 | 160 | ||
| 161 | |||
| 119 | retry: | 162 | retry: |
| 120 | mutex_lock(&lmh_lock); | 163 | mutex_lock(&lmh_lock); |
| 121 | 164 | ||
| 165 | if (list_empty(&nolock_proto.lw_list)) | ||
| 166 | list_add(&nolock_proto.lw_list, &lmh_list); | ||
| 167 | |||
| 122 | found = 0; | 168 | found = 0; |
| 123 | list_for_each_entry(lw, &lmh_list, lw_list) { | 169 | list_for_each_entry(lw, &lmh_list, lw_list) { |
| 124 | if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { | 170 | if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { |
| @@ -139,7 +185,8 @@ retry: | |||
| 139 | goto out; | 185 | goto out; |
| 140 | } | 186 | } |
| 141 | 187 | ||
| 142 | if (!try_module_get(lw->lw_ops->lm_owner)) { | 188 | if (lw->lw_ops->lm_owner && |
| 189 | !try_module_get(lw->lw_ops->lm_owner)) { | ||
| 143 | try = 0; | 190 | try = 0; |
| 144 | mutex_unlock(&lmh_lock); | 191 | mutex_unlock(&lmh_lock); |
| 145 | msleep(1000); | 192 | msleep(1000); |
| @@ -158,7 +205,8 @@ out: | |||
| 158 | void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) | 205 | void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) |
| 159 | { | 206 | { |
| 160 | mutex_lock(&lmh_lock); | 207 | mutex_lock(&lmh_lock); |
| 161 | lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); | 208 | if (lockstruct->ls_ops->lm_unmount) |
| 209 | lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); | ||
| 162 | if (lockstruct->ls_ops->lm_owner) | 210 | if (lockstruct->ls_ops->lm_owner) |
| 163 | module_put(lockstruct->ls_ops->lm_owner); | 211 | module_put(lockstruct->ls_ops->lm_owner); |
| 164 | mutex_unlock(&lmh_lock); | 212 | mutex_unlock(&lmh_lock); |
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index cf7ea8abec87..2482c9047505 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c | |||
| @@ -11,46 +11,60 @@ | |||
| 11 | 11 | ||
| 12 | static char junk_lvb[GDLM_LVB_SIZE]; | 12 | static char junk_lvb[GDLM_LVB_SIZE]; |
| 13 | 13 | ||
| 14 | static void queue_complete(struct gdlm_lock *lp) | 14 | |
| 15 | /* convert dlm lock-mode to gfs lock-state */ | ||
| 16 | |||
| 17 | static s16 gdlm_make_lmstate(s16 dlmmode) | ||
| 15 | { | 18 | { |
| 16 | struct gdlm_ls *ls = lp->ls; | 19 | switch (dlmmode) { |
| 20 | case DLM_LOCK_IV: | ||
| 21 | case DLM_LOCK_NL: | ||
| 22 | return LM_ST_UNLOCKED; | ||
| 23 | case DLM_LOCK_EX: | ||
| 24 | return LM_ST_EXCLUSIVE; | ||
| 25 | case DLM_LOCK_CW: | ||
| 26 | return LM_ST_DEFERRED; | ||
| 27 | case DLM_LOCK_PR: | ||
| 28 | return LM_ST_SHARED; | ||
| 29 | } | ||
| 30 | gdlm_assert(0, "unknown DLM mode %d", dlmmode); | ||
| 31 | return -1; | ||
| 32 | } | ||
| 17 | 33 | ||
| 18 | clear_bit(LFL_ACTIVE, &lp->flags); | 34 | /* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm |
| 35 | thread gets to it. */ | ||
| 36 | |||
| 37 | static void queue_submit(struct gdlm_lock *lp) | ||
| 38 | { | ||
| 39 | struct gdlm_ls *ls = lp->ls; | ||
| 19 | 40 | ||
| 20 | spin_lock(&ls->async_lock); | 41 | spin_lock(&ls->async_lock); |
| 21 | list_add_tail(&lp->clist, &ls->complete); | 42 | list_add_tail(&lp->delay_list, &ls->submit); |
| 22 | spin_unlock(&ls->async_lock); | 43 | spin_unlock(&ls->async_lock); |
| 23 | wake_up(&ls->thread_wait); | 44 | wake_up(&ls->thread_wait); |
| 24 | } | 45 | } |
| 25 | 46 | ||
| 26 | static inline void gdlm_ast(void *astarg) | 47 | static void wake_up_ast(struct gdlm_lock *lp) |
| 27 | { | 48 | { |
| 28 | queue_complete(astarg); | 49 | clear_bit(LFL_AST_WAIT, &lp->flags); |
| 50 | smp_mb__after_clear_bit(); | ||
| 51 | wake_up_bit(&lp->flags, LFL_AST_WAIT); | ||
| 29 | } | 52 | } |
| 30 | 53 | ||
| 31 | static inline void gdlm_bast(void *astarg, int mode) | 54 | static void gdlm_delete_lp(struct gdlm_lock *lp) |
| 32 | { | 55 | { |
| 33 | struct gdlm_lock *lp = astarg; | ||
| 34 | struct gdlm_ls *ls = lp->ls; | 56 | struct gdlm_ls *ls = lp->ls; |
| 35 | 57 | ||
| 36 | if (!mode) { | ||
| 37 | printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", | ||
| 38 | lp->lockname.ln_type, | ||
| 39 | (unsigned long long)lp->lockname.ln_number); | ||
| 40 | return; | ||
| 41 | } | ||
| 42 | |||
| 43 | spin_lock(&ls->async_lock); | 58 | spin_lock(&ls->async_lock); |
| 44 | if (!lp->bast_mode) { | 59 | if (!list_empty(&lp->delay_list)) |
| 45 | list_add_tail(&lp->blist, &ls->blocking); | 60 | list_del_init(&lp->delay_list); |
| 46 | lp->bast_mode = mode; | 61 | ls->all_locks_count--; |
| 47 | } else if (lp->bast_mode < mode) | ||
| 48 | lp->bast_mode = mode; | ||
| 49 | spin_unlock(&ls->async_lock); | 62 | spin_unlock(&ls->async_lock); |
| 50 | wake_up(&ls->thread_wait); | 63 | |
| 64 | kfree(lp); | ||
| 51 | } | 65 | } |
| 52 | 66 | ||
| 53 | void gdlm_queue_delayed(struct gdlm_lock *lp) | 67 | static void gdlm_queue_delayed(struct gdlm_lock *lp) |
| 54 | { | 68 | { |
| 55 | struct gdlm_ls *ls = lp->ls; | 69 | struct gdlm_ls *ls = lp->ls; |
| 56 | 70 | ||
| @@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp) | |||
| 59 | spin_unlock(&ls->async_lock); | 73 | spin_unlock(&ls->async_lock); |
| 60 | } | 74 | } |
| 61 | 75 | ||
| 76 | static void process_complete(struct gdlm_lock *lp) | ||
| 77 | { | ||
| 78 | struct gdlm_ls *ls = lp->ls; | ||
| 79 | struct lm_async_cb acb; | ||
| 80 | |||
| 81 | memset(&acb, 0, sizeof(acb)); | ||
| 82 | |||
| 83 | if (lp->lksb.sb_status == -DLM_ECANCEL) { | ||
| 84 | log_info("complete dlm cancel %x,%llx flags %lx", | ||
| 85 | lp->lockname.ln_type, | ||
| 86 | (unsigned long long)lp->lockname.ln_number, | ||
| 87 | lp->flags); | ||
| 88 | |||
| 89 | lp->req = lp->cur; | ||
| 90 | acb.lc_ret |= LM_OUT_CANCELED; | ||
| 91 | if (lp->cur == DLM_LOCK_IV) | ||
| 92 | lp->lksb.sb_lkid = 0; | ||
| 93 | goto out; | ||
| 94 | } | ||
| 95 | |||
| 96 | if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { | ||
| 97 | if (lp->lksb.sb_status != -DLM_EUNLOCK) { | ||
| 98 | log_info("unlock sb_status %d %x,%llx flags %lx", | ||
| 99 | lp->lksb.sb_status, lp->lockname.ln_type, | ||
| 100 | (unsigned long long)lp->lockname.ln_number, | ||
| 101 | lp->flags); | ||
| 102 | return; | ||
| 103 | } | ||
| 104 | |||
| 105 | lp->cur = DLM_LOCK_IV; | ||
| 106 | lp->req = DLM_LOCK_IV; | ||
| 107 | lp->lksb.sb_lkid = 0; | ||
| 108 | |||
| 109 | if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { | ||
| 110 | gdlm_delete_lp(lp); | ||
| 111 | return; | ||
| 112 | } | ||
| 113 | goto out; | ||
| 114 | } | ||
| 115 | |||
| 116 | if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) | ||
| 117 | memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); | ||
| 118 | |||
| 119 | if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { | ||
| 120 | if (lp->req == DLM_LOCK_PR) | ||
| 121 | lp->req = DLM_LOCK_CW; | ||
| 122 | else if (lp->req == DLM_LOCK_CW) | ||
| 123 | lp->req = DLM_LOCK_PR; | ||
| 124 | } | ||
| 125 | |||
| 126 | /* | ||
| 127 | * A canceled lock request. The lock was just taken off the delayed | ||
| 128 | * list and was never even submitted to dlm. | ||
| 129 | */ | ||
| 130 | |||
| 131 | if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { | ||
| 132 | log_info("complete internal cancel %x,%llx", | ||
| 133 | lp->lockname.ln_type, | ||
| 134 | (unsigned long long)lp->lockname.ln_number); | ||
| 135 | lp->req = lp->cur; | ||
| 136 | acb.lc_ret |= LM_OUT_CANCELED; | ||
| 137 | goto out; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* | ||
| 141 | * An error occurred. | ||
| 142 | */ | ||
| 143 | |||
| 144 | if (lp->lksb.sb_status) { | ||
| 145 | /* a "normal" error */ | ||
| 146 | if ((lp->lksb.sb_status == -EAGAIN) && | ||
| 147 | (lp->lkf & DLM_LKF_NOQUEUE)) { | ||
| 148 | lp->req = lp->cur; | ||
| 149 | if (lp->cur == DLM_LOCK_IV) | ||
| 150 | lp->lksb.sb_lkid = 0; | ||
| 151 | goto out; | ||
| 152 | } | ||
| 153 | |||
| 154 | /* this could only happen with cancels I think */ | ||
| 155 | log_info("ast sb_status %d %x,%llx flags %lx", | ||
| 156 | lp->lksb.sb_status, lp->lockname.ln_type, | ||
| 157 | (unsigned long long)lp->lockname.ln_number, | ||
| 158 | lp->flags); | ||
| 159 | return; | ||
| 160 | } | ||
| 161 | |||
| 162 | /* | ||
| 163 | * This is an AST for an EX->EX conversion for sync_lvb from GFS. | ||
| 164 | */ | ||
| 165 | |||
| 166 | if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { | ||
| 167 | wake_up_ast(lp); | ||
| 168 | return; | ||
| 169 | } | ||
| 170 | |||
| 171 | /* | ||
| 172 | * A lock has been demoted to NL because it initially completed during | ||
| 173 | * BLOCK_LOCKS. Now it must be requested in the originally requested | ||
| 174 | * mode. | ||
| 175 | */ | ||
| 176 | |||
| 177 | if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { | ||
| 178 | gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", | ||
| 179 | lp->lockname.ln_type, | ||
| 180 | (unsigned long long)lp->lockname.ln_number); | ||
| 181 | gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", | ||
| 182 | lp->lockname.ln_type, | ||
| 183 | (unsigned long long)lp->lockname.ln_number); | ||
| 184 | |||
| 185 | lp->cur = DLM_LOCK_NL; | ||
| 186 | lp->req = lp->prev_req; | ||
| 187 | lp->prev_req = DLM_LOCK_IV; | ||
| 188 | lp->lkf &= ~DLM_LKF_CONVDEADLK; | ||
| 189 | |||
| 190 | set_bit(LFL_NOCACHE, &lp->flags); | ||
| 191 | |||
| 192 | if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && | ||
| 193 | !test_bit(LFL_NOBLOCK, &lp->flags)) | ||
| 194 | gdlm_queue_delayed(lp); | ||
| 195 | else | ||
| 196 | queue_submit(lp); | ||
| 197 | return; | ||
| 198 | } | ||
| 199 | |||
| 200 | /* | ||
| 201 | * A request is granted during dlm recovery. It may be granted | ||
| 202 | * because the locks of a failed node were cleared. In that case, | ||
| 203 | * there may be inconsistent data beneath this lock and we must wait | ||
| 204 | * for recovery to complete to use it. When gfs recovery is done this | ||
| 205 | * granted lock will be converted to NL and then reacquired in this | ||
| 206 | * granted state. | ||
| 207 | */ | ||
| 208 | |||
| 209 | if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && | ||
| 210 | !test_bit(LFL_NOBLOCK, &lp->flags) && | ||
| 211 | lp->req != DLM_LOCK_NL) { | ||
| 212 | |||
| 213 | lp->cur = lp->req; | ||
| 214 | lp->prev_req = lp->req; | ||
| 215 | lp->req = DLM_LOCK_NL; | ||
| 216 | lp->lkf |= DLM_LKF_CONVERT; | ||
| 217 | lp->lkf &= ~DLM_LKF_CONVDEADLK; | ||
| 218 | |||
| 219 | log_debug("rereq %x,%llx id %x %d,%d", | ||
| 220 | lp->lockname.ln_type, | ||
| 221 | (unsigned long long)lp->lockname.ln_number, | ||
| 222 | lp->lksb.sb_lkid, lp->cur, lp->req); | ||
| 223 | |||
| 224 | set_bit(LFL_REREQUEST, &lp->flags); | ||
| 225 | queue_submit(lp); | ||
| 226 | return; | ||
| 227 | } | ||
| 228 | |||
| 229 | /* | ||
| 230 | * DLM demoted the lock to NL before it was granted so GFS must be | ||
| 231 | * told it cannot cache data for this lock. | ||
| 232 | */ | ||
| 233 | |||
| 234 | if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) | ||
| 235 | set_bit(LFL_NOCACHE, &lp->flags); | ||
| 236 | |||
| 237 | out: | ||
| 238 | /* | ||
| 239 | * This is an internal lock_dlm lock | ||
| 240 | */ | ||
| 241 | |||
| 242 | if (test_bit(LFL_INLOCK, &lp->flags)) { | ||
| 243 | clear_bit(LFL_NOBLOCK, &lp->flags); | ||
| 244 | lp->cur = lp->req; | ||
| 245 | wake_up_ast(lp); | ||
| 246 | return; | ||
| 247 | } | ||
| 248 | |||
| 249 | /* | ||
| 250 | * Normal completion of a lock request. Tell GFS it now has the lock. | ||
| 251 | */ | ||
| 252 | |||
| 253 | clear_bit(LFL_NOBLOCK, &lp->flags); | ||
| 254 | lp->cur = lp->req; | ||
| 255 | |||
| 256 | acb.lc_name = lp->lockname; | ||
| 257 | acb.lc_ret |= gdlm_make_lmstate(lp->cur); | ||
| 258 | |||
| 259 | ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); | ||
| 260 | } | ||
| 261 | |||
| 262 | static void gdlm_ast(void *astarg) | ||
| 263 | { | ||
| 264 | struct gdlm_lock *lp = astarg; | ||
| 265 | clear_bit(LFL_ACTIVE, &lp->flags); | ||
| 266 | process_complete(lp); | ||
| 267 | } | ||
| 268 | |||
| 269 | static void process_blocking(struct gdlm_lock *lp, int bast_mode) | ||
| 270 | { | ||
| 271 | struct gdlm_ls *ls = lp->ls; | ||
| 272 | unsigned int cb = 0; | ||
| 273 | |||
| 274 | switch (gdlm_make_lmstate(bast_mode)) { | ||
| 275 | case LM_ST_EXCLUSIVE: | ||
| 276 | cb = LM_CB_NEED_E; | ||
| 277 | break; | ||
| 278 | case LM_ST_DEFERRED: | ||
| 279 | cb = LM_CB_NEED_D; | ||
| 280 | break; | ||
| 281 | case LM_ST_SHARED: | ||
| 282 | cb = LM_CB_NEED_S; | ||
| 283 | break; | ||
| 284 | default: | ||
| 285 | gdlm_assert(0, "unknown bast mode %u", bast_mode); | ||
| 286 | } | ||
| 287 | |||
| 288 | ls->fscb(ls->sdp, cb, &lp->lockname); | ||
| 289 | } | ||
| 290 | |||
| 291 | |||
| 292 | static void gdlm_bast(void *astarg, int mode) | ||
| 293 | { | ||
| 294 | struct gdlm_lock *lp = astarg; | ||
| 295 | |||
| 296 | if (!mode) { | ||
| 297 | printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", | ||
| 298 | lp->lockname.ln_type, | ||
| 299 | (unsigned long long)lp->lockname.ln_number); | ||
| 300 | return; | ||
| 301 | } | ||
| 302 | |||
| 303 | process_blocking(lp, mode); | ||
| 304 | } | ||
| 305 | |||
| 62 | /* convert gfs lock-state to dlm lock-mode */ | 306 | /* convert gfs lock-state to dlm lock-mode */ |
| 63 | 307 | ||
| 64 | static s16 make_mode(s16 lmstate) | 308 | static s16 make_mode(s16 lmstate) |
| @@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate) | |||
| 77 | return -1; | 321 | return -1; |
| 78 | } | 322 | } |
| 79 | 323 | ||
| 80 | /* convert dlm lock-mode to gfs lock-state */ | ||
| 81 | |||
| 82 | s16 gdlm_make_lmstate(s16 dlmmode) | ||
| 83 | { | ||
| 84 | switch (dlmmode) { | ||
| 85 | case DLM_LOCK_IV: | ||
| 86 | case DLM_LOCK_NL: | ||
| 87 | return LM_ST_UNLOCKED; | ||
| 88 | case DLM_LOCK_EX: | ||
| 89 | return LM_ST_EXCLUSIVE; | ||
| 90 | case DLM_LOCK_CW: | ||
| 91 | return LM_ST_DEFERRED; | ||
| 92 | case DLM_LOCK_PR: | ||
| 93 | return LM_ST_SHARED; | ||
| 94 | } | ||
| 95 | gdlm_assert(0, "unknown DLM mode %d", dlmmode); | ||
| 96 | return -1; | ||
| 97 | } | ||
| 98 | 324 | ||
| 99 | /* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and | 325 | /* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and |
| 100 | DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ | 326 | DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ |
| @@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, | |||
| 134 | 360 | ||
| 135 | if (lp->lksb.sb_lkid != 0) { | 361 | if (lp->lksb.sb_lkid != 0) { |
| 136 | lkf |= DLM_LKF_CONVERT; | 362 | lkf |= DLM_LKF_CONVERT; |
| 137 | |||
| 138 | /* Conversion deadlock avoidance by DLM */ | ||
| 139 | |||
| 140 | if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && | ||
| 141 | !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && | ||
| 142 | !(lkf & DLM_LKF_NOQUEUE) && | ||
| 143 | cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) | ||
| 144 | lkf |= DLM_LKF_CONVDEADLK; | ||
| 145 | } | 363 | } |
| 146 | 364 | ||
| 147 | if (lp->lvb) | 365 | if (lp->lvb) |
| @@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, | |||
| 173 | make_strname(name, &lp->strname); | 391 | make_strname(name, &lp->strname); |
| 174 | lp->ls = ls; | 392 | lp->ls = ls; |
| 175 | lp->cur = DLM_LOCK_IV; | 393 | lp->cur = DLM_LOCK_IV; |
| 176 | lp->lvb = NULL; | ||
| 177 | lp->hold_null = NULL; | ||
| 178 | INIT_LIST_HEAD(&lp->clist); | ||
| 179 | INIT_LIST_HEAD(&lp->blist); | ||
| 180 | INIT_LIST_HEAD(&lp->delay_list); | 394 | INIT_LIST_HEAD(&lp->delay_list); |
| 181 | 395 | ||
| 182 | spin_lock(&ls->async_lock); | 396 | spin_lock(&ls->async_lock); |
| 183 | list_add(&lp->all_list, &ls->all_locks); | ||
| 184 | ls->all_locks_count++; | 397 | ls->all_locks_count++; |
| 185 | spin_unlock(&ls->async_lock); | 398 | spin_unlock(&ls->async_lock); |
| 186 | 399 | ||
| @@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, | |||
| 188 | return 0; | 401 | return 0; |
| 189 | } | 402 | } |
| 190 | 403 | ||
| 191 | void gdlm_delete_lp(struct gdlm_lock *lp) | ||
| 192 | { | ||
| 193 | struct gdlm_ls *ls = lp->ls; | ||
| 194 | |||
| 195 | spin_lock(&ls->async_lock); | ||
| 196 | if (!list_empty(&lp->clist)) | ||
| 197 | list_del_init(&lp->clist); | ||
| 198 | if (!list_empty(&lp->blist)) | ||
| 199 | list_del_init(&lp->blist); | ||
| 200 | if (!list_empty(&lp->delay_list)) | ||
| 201 | list_del_init(&lp->delay_list); | ||
| 202 | gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, | ||
| 203 | (unsigned long long)lp->lockname.ln_number); | ||
| 204 | list_del_init(&lp->all_list); | ||
| 205 | ls->all_locks_count--; | ||
| 206 | spin_unlock(&ls->async_lock); | ||
| 207 | |||
| 208 | kfree(lp); | ||
| 209 | } | ||
| 210 | |||
| 211 | int gdlm_get_lock(void *lockspace, struct lm_lockname *name, | 404 | int gdlm_get_lock(void *lockspace, struct lm_lockname *name, |
| 212 | void **lockp) | 405 | void **lockp) |
| 213 | { | 406 | { |
| @@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp) | |||
| 261 | 454 | ||
| 262 | if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { | 455 | if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { |
| 263 | lp->lksb.sb_status = -EAGAIN; | 456 | lp->lksb.sb_status = -EAGAIN; |
| 264 | queue_complete(lp); | 457 | gdlm_ast(lp); |
| 265 | error = 0; | 458 | error = 0; |
| 266 | } | 459 | } |
| 267 | 460 | ||
| @@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state, | |||
| 308 | { | 501 | { |
| 309 | struct gdlm_lock *lp = lock; | 502 | struct gdlm_lock *lp = lock; |
| 310 | 503 | ||
| 504 | if (req_state == LM_ST_UNLOCKED) | ||
| 505 | return gdlm_unlock(lock, cur_state); | ||
| 506 | |||
| 311 | clear_bit(LFL_DLM_CANCEL, &lp->flags); | 510 | clear_bit(LFL_DLM_CANCEL, &lp->flags); |
| 312 | if (flags & LM_FLAG_NOEXP) | 511 | if (flags & LM_FLAG_NOEXP) |
| 313 | set_bit(LFL_NOBLOCK, &lp->flags); | 512 | set_bit(LFL_NOBLOCK, &lp->flags); |
| @@ -351,7 +550,7 @@ void gdlm_cancel(void *lock) | |||
| 351 | if (delay_list) { | 550 | if (delay_list) { |
| 352 | set_bit(LFL_CANCEL, &lp->flags); | 551 | set_bit(LFL_CANCEL, &lp->flags); |
| 353 | set_bit(LFL_ACTIVE, &lp->flags); | 552 | set_bit(LFL_ACTIVE, &lp->flags); |
| 354 | queue_complete(lp); | 553 | gdlm_ast(lp); |
| 355 | return; | 554 | return; |
| 356 | } | 555 | } |
| 357 | 556 | ||
| @@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls) | |||
| 507 | wake_up(&ls->thread_wait); | 706 | wake_up(&ls->thread_wait); |
| 508 | } | 707 | } |
| 509 | 708 | ||
| 510 | int gdlm_release_all_locks(struct gdlm_ls *ls) | ||
| 511 | { | ||
| 512 | struct gdlm_lock *lp, *safe; | ||
| 513 | int count = 0; | ||
| 514 | |||
| 515 | spin_lock(&ls->async_lock); | ||
| 516 | list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) { | ||
| 517 | list_del_init(&lp->all_list); | ||
| 518 | |||
| 519 | if (lp->lvb && lp->lvb != junk_lvb) | ||
| 520 | kfree(lp->lvb); | ||
| 521 | kfree(lp); | ||
| 522 | count++; | ||
| 523 | } | ||
| 524 | spin_unlock(&ls->async_lock); | ||
| 525 | |||
| 526 | return count; | ||
| 527 | } | ||
| 528 | |||
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index a243cf69c54e..3c98e7c6f93b 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h | |||
| @@ -72,19 +72,12 @@ struct gdlm_ls { | |||
| 72 | int recover_jid_done; | 72 | int recover_jid_done; |
| 73 | int recover_jid_status; | 73 | int recover_jid_status; |
| 74 | spinlock_t async_lock; | 74 | spinlock_t async_lock; |
| 75 | struct list_head complete; | ||
| 76 | struct list_head blocking; | ||
| 77 | struct list_head delayed; | 75 | struct list_head delayed; |
| 78 | struct list_head submit; | 76 | struct list_head submit; |
| 79 | struct list_head all_locks; | ||
| 80 | u32 all_locks_count; | 77 | u32 all_locks_count; |
| 81 | wait_queue_head_t wait_control; | 78 | wait_queue_head_t wait_control; |
| 82 | struct task_struct *thread1; | 79 | struct task_struct *thread; |
| 83 | struct task_struct *thread2; | ||
| 84 | wait_queue_head_t thread_wait; | 80 | wait_queue_head_t thread_wait; |
| 85 | unsigned long drop_time; | ||
| 86 | int drop_locks_count; | ||
| 87 | int drop_locks_period; | ||
| 88 | }; | 81 | }; |
| 89 | 82 | ||
| 90 | enum { | 83 | enum { |
| @@ -117,12 +110,7 @@ struct gdlm_lock { | |||
| 117 | u32 lkf; /* dlm flags DLM_LKF_ */ | 110 | u32 lkf; /* dlm flags DLM_LKF_ */ |
| 118 | unsigned long flags; /* lock_dlm flags LFL_ */ | 111 | unsigned long flags; /* lock_dlm flags LFL_ */ |
| 119 | 112 | ||
| 120 | int bast_mode; /* protected by async_lock */ | ||
| 121 | |||
| 122 | struct list_head clist; /* complete */ | ||
| 123 | struct list_head blist; /* blocking */ | ||
| 124 | struct list_head delay_list; /* delayed */ | 113 | struct list_head delay_list; /* delayed */ |
| 125 | struct list_head all_list; /* all locks for the fs */ | ||
| 126 | struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ | 114 | struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ |
| 127 | }; | 115 | }; |
| 128 | 116 | ||
| @@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *); | |||
| 159 | 147 | ||
| 160 | /* lock.c */ | 148 | /* lock.c */ |
| 161 | 149 | ||
| 162 | s16 gdlm_make_lmstate(s16); | ||
| 163 | void gdlm_queue_delayed(struct gdlm_lock *); | ||
| 164 | void gdlm_submit_delayed(struct gdlm_ls *); | 150 | void gdlm_submit_delayed(struct gdlm_ls *); |
| 165 | int gdlm_release_all_locks(struct gdlm_ls *); | ||
| 166 | void gdlm_delete_lp(struct gdlm_lock *); | ||
| 167 | unsigned int gdlm_do_lock(struct gdlm_lock *); | 151 | unsigned int gdlm_do_lock(struct gdlm_lock *); |
| 168 | 152 | ||
| 169 | int gdlm_get_lock(void *, struct lm_lockname *, void **); | 153 | int gdlm_get_lock(void *, struct lm_lockname *, void **); |
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 470bdf650b50..09d78c216f48 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c | |||
| @@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, | |||
| 22 | if (!ls) | 22 | if (!ls) |
| 23 | return NULL; | 23 | return NULL; |
| 24 | 24 | ||
| 25 | ls->drop_locks_count = GDLM_DROP_COUNT; | ||
| 26 | ls->drop_locks_period = GDLM_DROP_PERIOD; | ||
| 27 | ls->fscb = cb; | 25 | ls->fscb = cb; |
| 28 | ls->sdp = sdp; | 26 | ls->sdp = sdp; |
| 29 | ls->fsflags = flags; | 27 | ls->fsflags = flags; |
| 30 | spin_lock_init(&ls->async_lock); | 28 | spin_lock_init(&ls->async_lock); |
| 31 | INIT_LIST_HEAD(&ls->complete); | ||
| 32 | INIT_LIST_HEAD(&ls->blocking); | ||
| 33 | INIT_LIST_HEAD(&ls->delayed); | 29 | INIT_LIST_HEAD(&ls->delayed); |
| 34 | INIT_LIST_HEAD(&ls->submit); | 30 | INIT_LIST_HEAD(&ls->submit); |
| 35 | INIT_LIST_HEAD(&ls->all_locks); | ||
| 36 | init_waitqueue_head(&ls->thread_wait); | 31 | init_waitqueue_head(&ls->thread_wait); |
| 37 | init_waitqueue_head(&ls->wait_control); | 32 | init_waitqueue_head(&ls->wait_control); |
| 38 | ls->thread1 = NULL; | ||
| 39 | ls->thread2 = NULL; | ||
| 40 | ls->drop_time = jiffies; | ||
| 41 | ls->jid = -1; | 33 | ls->jid = -1; |
| 42 | 34 | ||
| 43 | strncpy(buf, table_name, 256); | 35 | strncpy(buf, table_name, 256); |
| @@ -180,7 +172,6 @@ out: | |||
| 180 | static void gdlm_unmount(void *lockspace) | 172 | static void gdlm_unmount(void *lockspace) |
| 181 | { | 173 | { |
| 182 | struct gdlm_ls *ls = lockspace; | 174 | struct gdlm_ls *ls = lockspace; |
| 183 | int rv; | ||
| 184 | 175 | ||
| 185 | log_debug("unmount flags %lx", ls->flags); | 176 | log_debug("unmount flags %lx", ls->flags); |
| 186 | 177 | ||
| @@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace) | |||
| 194 | gdlm_kobject_release(ls); | 185 | gdlm_kobject_release(ls); |
| 195 | dlm_release_lockspace(ls->dlm_lockspace, 2); | 186 | dlm_release_lockspace(ls->dlm_lockspace, 2); |
| 196 | gdlm_release_threads(ls); | 187 | gdlm_release_threads(ls); |
| 197 | rv = gdlm_release_all_locks(ls); | 188 | BUG_ON(ls->all_locks_count); |
| 198 | if (rv) | ||
| 199 | log_info("gdlm_unmount: %d stray locks freed", rv); | ||
| 200 | out: | 189 | out: |
| 201 | kfree(ls); | 190 | kfree(ls); |
| 202 | } | 191 | } |
| @@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace) | |||
| 232 | 221 | ||
| 233 | dlm_release_lockspace(ls->dlm_lockspace, 2); | 222 | dlm_release_lockspace(ls->dlm_lockspace, 2); |
| 234 | gdlm_release_threads(ls); | 223 | gdlm_release_threads(ls); |
| 235 | gdlm_release_all_locks(ls); | ||
| 236 | gdlm_kobject_release(ls); | 224 | gdlm_kobject_release(ls); |
| 237 | } | 225 | } |
| 238 | 226 | ||
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index a4ff271df9ee..4ec571c3d8a9 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c | |||
| @@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) | |||
| 114 | return sprintf(buf, "%d\n", ls->recover_jid_status); | 114 | return sprintf(buf, "%d\n", ls->recover_jid_status); |
| 115 | } | 115 | } |
| 116 | 116 | ||
| 117 | static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) | ||
| 118 | { | ||
| 119 | return sprintf(buf, "%d\n", ls->drop_locks_count); | ||
| 120 | } | ||
| 121 | |||
| 122 | static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) | ||
| 123 | { | ||
| 124 | ls->drop_locks_count = simple_strtol(buf, NULL, 0); | ||
| 125 | return len; | ||
| 126 | } | ||
| 127 | |||
| 128 | struct gdlm_attr { | 117 | struct gdlm_attr { |
| 129 | struct attribute attr; | 118 | struct attribute attr; |
| 130 | ssize_t (*show)(struct gdlm_ls *, char *); | 119 | ssize_t (*show)(struct gdlm_ls *, char *); |
| @@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); | |||
| 144 | GDLM_ATTR(recover, 0644, recover_show, recover_store); | 133 | GDLM_ATTR(recover, 0644, recover_show, recover_store); |
| 145 | GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); | 134 | GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); |
| 146 | GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); | 135 | GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); |
| 147 | GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); | ||
| 148 | 136 | ||
| 149 | static struct attribute *gdlm_attrs[] = { | 137 | static struct attribute *gdlm_attrs[] = { |
| 150 | &gdlm_attr_proto_name.attr, | 138 | &gdlm_attr_proto_name.attr, |
| @@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = { | |||
| 157 | &gdlm_attr_recover.attr, | 145 | &gdlm_attr_recover.attr, |
| 158 | &gdlm_attr_recover_done.attr, | 146 | &gdlm_attr_recover_done.attr, |
| 159 | &gdlm_attr_recover_status.attr, | 147 | &gdlm_attr_recover_status.attr, |
| 160 | &gdlm_attr_drop_count.attr, | ||
| 161 | NULL, | 148 | NULL, |
| 162 | }; | 149 | }; |
| 163 | 150 | ||
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index e53db6fd28ab..38823efd698c 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c | |||
| @@ -9,367 +9,60 @@ | |||
| 9 | 9 | ||
| 10 | #include "lock_dlm.h" | 10 | #include "lock_dlm.h" |
| 11 | 11 | ||
| 12 | /* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm | 12 | static inline int no_work(struct gdlm_ls *ls) |
| 13 | thread gets to it. */ | ||
| 14 | |||
| 15 | static void queue_submit(struct gdlm_lock *lp) | ||
| 16 | { | ||
| 17 | struct gdlm_ls *ls = lp->ls; | ||
| 18 | |||
| 19 | spin_lock(&ls->async_lock); | ||
| 20 | list_add_tail(&lp->delay_list, &ls->submit); | ||
| 21 | spin_unlock(&ls->async_lock); | ||
| 22 | wake_up(&ls->thread_wait); | ||
| 23 | } | ||
| 24 | |||
| 25 | static void process_blocking(struct gdlm_lock *lp, int bast_mode) | ||
| 26 | { | ||
| 27 | struct gdlm_ls *ls = lp->ls; | ||
| 28 | unsigned int cb = 0; | ||
| 29 | |||
| 30 | switch (gdlm_make_lmstate(bast_mode)) { | ||
| 31 | case LM_ST_EXCLUSIVE: | ||
| 32 | cb = LM_CB_NEED_E; | ||
| 33 | break; | ||
| 34 | case LM_ST_DEFERRED: | ||
| 35 | cb = LM_CB_NEED_D; | ||
| 36 | break; | ||
| 37 | case LM_ST_SHARED: | ||
| 38 | cb = LM_CB_NEED_S; | ||
| 39 | break; | ||
| 40 | default: | ||
| 41 | gdlm_assert(0, "unknown bast mode %u", lp->bast_mode); | ||
| 42 | } | ||
| 43 | |||
| 44 | ls->fscb(ls->sdp, cb, &lp->lockname); | ||
| 45 | } | ||
| 46 | |||
| 47 | static void wake_up_ast(struct gdlm_lock *lp) | ||
| 48 | { | ||
| 49 | clear_bit(LFL_AST_WAIT, &lp->flags); | ||
| 50 | smp_mb__after_clear_bit(); | ||
| 51 | wake_up_bit(&lp->flags, LFL_AST_WAIT); | ||
| 52 | } | ||
| 53 | |||
| 54 | static void process_complete(struct gdlm_lock *lp) | ||
| 55 | { | ||
| 56 | struct gdlm_ls *ls = lp->ls; | ||
| 57 | struct lm_async_cb acb; | ||
| 58 | s16 prev_mode = lp->cur; | ||
| 59 | |||
| 60 | memset(&acb, 0, sizeof(acb)); | ||
| 61 | |||
| 62 | if (lp->lksb.sb_status == -DLM_ECANCEL) { | ||
| 63 | log_info("complete dlm cancel %x,%llx flags %lx", | ||
| 64 | lp->lockname.ln_type, | ||
| 65 | (unsigned long long)lp->lockname.ln_number, | ||
| 66 | lp->flags); | ||
| 67 | |||
| 68 | lp->req = lp->cur; | ||
| 69 | acb.lc_ret |= LM_OUT_CANCELED; | ||
| 70 | if (lp->cur == DLM_LOCK_IV) | ||
| 71 | lp->lksb.sb_lkid = 0; | ||
| 72 | goto out; | ||
| 73 | } | ||
| 74 | |||
| 75 | if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { | ||
| 76 | if (lp->lksb.sb_status != -DLM_EUNLOCK) { | ||
| 77 | log_info("unlock sb_status %d %x,%llx flags %lx", | ||
| 78 | lp->lksb.sb_status, lp->lockname.ln_type, | ||
| 79 | (unsigned long long)lp->lockname.ln_number, | ||
| 80 | lp->flags); | ||
| 81 | return; | ||
| 82 | } | ||
| 83 | |||
| 84 | lp->cur = DLM_LOCK_IV; | ||
| 85 | lp->req = DLM_LOCK_IV; | ||
| 86 | lp->lksb.sb_lkid = 0; | ||
| 87 | |||
| 88 | if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { | ||
| 89 | gdlm_delete_lp(lp); | ||
| 90 | return; | ||
| 91 | } | ||
| 92 | goto out; | ||
| 93 | } | ||
| 94 | |||
| 95 | if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) | ||
| 96 | memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); | ||
| 97 | |||
| 98 | if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { | ||
| 99 | if (lp->req == DLM_LOCK_PR) | ||
| 100 | lp->req = DLM_LOCK_CW; | ||
| 101 | else if (lp->req == DLM_LOCK_CW) | ||
| 102 | lp->req = DLM_LOCK_PR; | ||
| 103 | } | ||
| 104 | |||
| 105 | /* | ||
| 106 | * A canceled lock request. The lock was just taken off the delayed | ||
| 107 | * list and was never even submitted to dlm. | ||
| 108 | */ | ||
| 109 | |||
| 110 | if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { | ||
| 111 | log_info("complete internal cancel %x,%llx", | ||
| 112 | lp->lockname.ln_type, | ||
| 113 | (unsigned long long)lp->lockname.ln_number); | ||
| 114 | lp->req = lp->cur; | ||
| 115 | acb.lc_ret |= LM_OUT_CANCELED; | ||
| 116 | goto out; | ||
| 117 | } | ||
| 118 | |||
| 119 | /* | ||
| 120 | * An error occured. | ||
| 121 | */ | ||
| 122 | |||
| 123 | if (lp->lksb.sb_status) { | ||
| 124 | /* a "normal" error */ | ||
| 125 | if ((lp->lksb.sb_status == -EAGAIN) && | ||
| 126 | (lp->lkf & DLM_LKF_NOQUEUE)) { | ||
| 127 | lp->req = lp->cur; | ||
| 128 | if (lp->cur == DLM_LOCK_IV) | ||
| 129 | lp->lksb.sb_lkid = 0; | ||
| 130 | goto out; | ||
| 131 | } | ||
| 132 | |||
| 133 | /* this could only happen with cancels I think */ | ||
| 134 | log_info("ast sb_status %d %x,%llx flags %lx", | ||
| 135 | lp->lksb.sb_status, lp->lockname.ln_type, | ||
| 136 | (unsigned long long)lp->lockname.ln_number, | ||
| 137 | lp->flags); | ||
| 138 | if (lp->lksb.sb_status == -EDEADLOCK && | ||
| 139 | lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { | ||
| 140 | lp->req = lp->cur; | ||
| 141 | acb.lc_ret |= LM_OUT_CONV_DEADLK; | ||
| 142 | if (lp->cur == DLM_LOCK_IV) | ||
| 143 | lp->lksb.sb_lkid = 0; | ||
| 144 | goto out; | ||
| 145 | } else | ||
| 146 | return; | ||
| 147 | } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * This is an AST for an EX->EX conversion for sync_lvb from GFS. | ||
| 151 | */ | ||
| 152 | |||
| 153 | if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { | ||
| 154 | wake_up_ast(lp); | ||
| 155 | return; | ||
| 156 | } | ||
| 157 | |||
| 158 | /* | ||
| 159 | * A lock has been demoted to NL because it initially completed during | ||
| 160 | * BLOCK_LOCKS. Now it must be requested in the originally requested | ||
| 161 | * mode. | ||
| 162 | */ | ||
| 163 | |||
| 164 | if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { | ||
| 165 | gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", | ||
| 166 | lp->lockname.ln_type, | ||
| 167 | (unsigned long long)lp->lockname.ln_number); | ||
| 168 | gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", | ||
| 169 | lp->lockname.ln_type, | ||
| 170 | (unsigned long long)lp->lockname.ln_number); | ||
| 171 | |||
| 172 | lp->cur = DLM_LOCK_NL; | ||
| 173 | lp->req = lp->prev_req; | ||
| 174 | lp->prev_req = DLM_LOCK_IV; | ||
| 175 | lp->lkf &= ~DLM_LKF_CONVDEADLK; | ||
| 176 | |||
| 177 | set_bit(LFL_NOCACHE, &lp->flags); | ||
| 178 | |||
| 179 | if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && | ||
| 180 | !test_bit(LFL_NOBLOCK, &lp->flags)) | ||
| 181 | gdlm_queue_delayed(lp); | ||
| 182 | else | ||
| 183 | queue_submit(lp); | ||
| 184 | return; | ||
| 185 | } | ||
| 186 | |||
| 187 | /* | ||
| 188 | * A request is granted during dlm recovery. It may be granted | ||
| 189 | * because the locks of a failed node were cleared. In that case, | ||
| 190 | * there may be inconsistent data beneath this lock and we must wait | ||
| 191 | * for recovery to complete to use it. When gfs recovery is done this | ||
| 192 | * granted lock will be converted to NL and then reacquired in this | ||
| 193 | * granted state. | ||
| 194 | */ | ||
| 195 | |||
| 196 | if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && | ||
| 197 | !test_bit(LFL_NOBLOCK, &lp->flags) && | ||
| 198 | lp->req != DLM_LOCK_NL) { | ||
| 199 | |||
| 200 | lp->cur = lp->req; | ||
| 201 | lp->prev_req = lp->req; | ||
| 202 | lp->req = DLM_LOCK_NL; | ||
| 203 | lp->lkf |= DLM_LKF_CONVERT; | ||
| 204 | lp->lkf &= ~DLM_LKF_CONVDEADLK; | ||
| 205 | |||
| 206 | log_debug("rereq %x,%llx id %x %d,%d", | ||
| 207 | lp->lockname.ln_type, | ||
| 208 | (unsigned long long)lp->lockname.ln_number, | ||
| 209 | lp->lksb.sb_lkid, lp->cur, lp->req); | ||
| 210 | |||
| 211 | set_bit(LFL_REREQUEST, &lp->flags); | ||
| 212 | queue_submit(lp); | ||
| 213 | return; | ||
| 214 | } | ||
| 215 | |||
| 216 | /* | ||
| 217 | * DLM demoted the lock to NL before it was granted so GFS must be | ||
| 218 | * told it cannot cache data for this lock. | ||
| 219 | */ | ||
| 220 | |||
| 221 | if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) | ||
| 222 | set_bit(LFL_NOCACHE, &lp->flags); | ||
| 223 | |||
| 224 | out: | ||
| 225 | /* | ||
| 226 | * This is an internal lock_dlm lock | ||
| 227 | */ | ||
| 228 | |||
| 229 | if (test_bit(LFL_INLOCK, &lp->flags)) { | ||
| 230 | clear_bit(LFL_NOBLOCK, &lp->flags); | ||
| 231 | lp->cur = lp->req; | ||
| 232 | wake_up_ast(lp); | ||
| 233 | return; | ||
| 234 | } | ||
| 235 | |||
| 236 | /* | ||
| 237 | * Normal completion of a lock request. Tell GFS it now has the lock. | ||
| 238 | */ | ||
| 239 | |||
| 240 | clear_bit(LFL_NOBLOCK, &lp->flags); | ||
| 241 | lp->cur = lp->req; | ||
| 242 | |||
| 243 | acb.lc_name = lp->lockname; | ||
| 244 | acb.lc_ret |= gdlm_make_lmstate(lp->cur); | ||
| 245 | |||
| 246 | if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && | ||
| 247 | (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) | ||
| 248 | acb.lc_ret |= LM_OUT_CACHEABLE; | ||
| 249 | |||
| 250 | ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); | ||
| 251 | } | ||
| 252 | |||
| 253 | static inline int no_work(struct gdlm_ls *ls, int blocking) | ||
| 254 | { | 13 | { |
| 255 | int ret; | 14 | int ret; |
| 256 | 15 | ||
| 257 | spin_lock(&ls->async_lock); | 16 | spin_lock(&ls->async_lock); |
| 258 | ret = list_empty(&ls->complete) && list_empty(&ls->submit); | 17 | ret = list_empty(&ls->submit); |
| 259 | if (ret && blocking) | ||
| 260 | ret = list_empty(&ls->blocking); | ||
| 261 | spin_unlock(&ls->async_lock); | 18 | spin_unlock(&ls->async_lock); |
| 262 | 19 | ||
| 263 | return ret; | 20 | return ret; |
| 264 | } | 21 | } |
| 265 | 22 | ||
| 266 | static inline int check_drop(struct gdlm_ls *ls) | 23 | static int gdlm_thread(void *data) |
| 267 | { | ||
| 268 | if (!ls->drop_locks_count) | ||
| 269 | return 0; | ||
| 270 | |||
| 271 | if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { | ||
| 272 | ls->drop_time = jiffies; | ||
| 273 | if (ls->all_locks_count >= ls->drop_locks_count) | ||
| 274 | return 1; | ||
| 275 | } | ||
| 276 | return 0; | ||
| 277 | } | ||
| 278 | |||
| 279 | static int gdlm_thread(void *data, int blist) | ||
| 280 | { | 24 | { |
| 281 | struct gdlm_ls *ls = (struct gdlm_ls *) data; | 25 | struct gdlm_ls *ls = (struct gdlm_ls *) data; |
| 282 | struct gdlm_lock *lp = NULL; | 26 | struct gdlm_lock *lp = NULL; |
| 283 | uint8_t complete, blocking, submit, drop; | ||
| 284 | |||
| 285 | /* Only thread1 is allowed to do blocking callbacks since gfs | ||
| 286 | may wait for a completion callback within a blocking cb. */ | ||
| 287 | 27 | ||
| 288 | while (!kthread_should_stop()) { | 28 | while (!kthread_should_stop()) { |
| 289 | wait_event_interruptible(ls->thread_wait, | 29 | wait_event_interruptible(ls->thread_wait, |
| 290 | !no_work(ls, blist) || kthread_should_stop()); | 30 | !no_work(ls) || kthread_should_stop()); |
| 291 | |||
| 292 | complete = blocking = submit = drop = 0; | ||
| 293 | 31 | ||
| 294 | spin_lock(&ls->async_lock); | 32 | spin_lock(&ls->async_lock); |
| 295 | 33 | ||
| 296 | if (blist && !list_empty(&ls->blocking)) { | 34 | if (!list_empty(&ls->submit)) { |
| 297 | lp = list_entry(ls->blocking.next, struct gdlm_lock, | ||
| 298 | blist); | ||
| 299 | list_del_init(&lp->blist); | ||
| 300 | blocking = lp->bast_mode; | ||
| 301 | lp->bast_mode = 0; | ||
| 302 | } else if (!list_empty(&ls->complete)) { | ||
| 303 | lp = list_entry(ls->complete.next, struct gdlm_lock, | ||
| 304 | clist); | ||
| 305 | list_del_init(&lp->clist); | ||
| 306 | complete = 1; | ||
| 307 | } else if (!list_empty(&ls->submit)) { | ||
| 308 | lp = list_entry(ls->submit.next, struct gdlm_lock, | 35 | lp = list_entry(ls->submit.next, struct gdlm_lock, |
| 309 | delay_list); | 36 | delay_list); |
| 310 | list_del_init(&lp->delay_list); | 37 | list_del_init(&lp->delay_list); |
| 311 | submit = 1; | 38 | spin_unlock(&ls->async_lock); |
| 39 | gdlm_do_lock(lp); | ||
| 40 | spin_lock(&ls->async_lock); | ||
| 312 | } | 41 | } |
| 313 | |||
| 314 | drop = check_drop(ls); | ||
| 315 | spin_unlock(&ls->async_lock); | 42 | spin_unlock(&ls->async_lock); |
| 316 | |||
| 317 | if (complete) | ||
| 318 | process_complete(lp); | ||
| 319 | |||
| 320 | else if (blocking) | ||
| 321 | process_blocking(lp, blocking); | ||
| 322 | |||
| 323 | else if (submit) | ||
| 324 | gdlm_do_lock(lp); | ||
| 325 | |||
| 326 | if (drop) | ||
| 327 | ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); | ||
| 328 | |||
| 329 | schedule(); | ||
| 330 | } | 43 | } |
| 331 | 44 | ||
| 332 | return 0; | 45 | return 0; |
| 333 | } | 46 | } |
| 334 | 47 | ||
| 335 | static int gdlm_thread1(void *data) | ||
| 336 | { | ||
| 337 | return gdlm_thread(data, 1); | ||
| 338 | } | ||
| 339 | |||
| 340 | static int gdlm_thread2(void *data) | ||
| 341 | { | ||
| 342 | return gdlm_thread(data, 0); | ||
| 343 | } | ||
| 344 | |||
| 345 | int gdlm_init_threads(struct gdlm_ls *ls) | 48 | int gdlm_init_threads(struct gdlm_ls *ls) |
| 346 | { | 49 | { |
| 347 | struct task_struct *p; | 50 | struct task_struct *p; |
| 348 | int error; | 51 | int error; |
| 349 | 52 | ||
| 350 | p = kthread_run(gdlm_thread1, ls, "lock_dlm1"); | 53 | p = kthread_run(gdlm_thread, ls, "lock_dlm"); |
| 351 | error = IS_ERR(p); | ||
| 352 | if (error) { | ||
| 353 | log_error("can't start lock_dlm1 thread %d", error); | ||
| 354 | return error; | ||
| 355 | } | ||
| 356 | ls->thread1 = p; | ||
| 357 | |||
| 358 | p = kthread_run(gdlm_thread2, ls, "lock_dlm2"); | ||
| 359 | error = IS_ERR(p); | 54 | error = IS_ERR(p); |
| 360 | if (error) { | 55 | if (error) { |
| 361 | log_error("can't start lock_dlm2 thread %d", error); | 56 | log_error("can't start lock_dlm thread %d", error); |
| 362 | kthread_stop(ls->thread1); | ||
| 363 | return error; | 57 | return error; |
| 364 | } | 58 | } |
| 365 | ls->thread2 = p; | 59 | ls->thread = p; |
| 366 | 60 | ||
| 367 | return 0; | 61 | return 0; |
| 368 | } | 62 | } |
| 369 | 63 | ||
| 370 | void gdlm_release_threads(struct gdlm_ls *ls) | 64 | void gdlm_release_threads(struct gdlm_ls *ls) |
| 371 | { | 65 | { |
| 372 | kthread_stop(ls->thread1); | 66 | kthread_stop(ls->thread); |
| 373 | kthread_stop(ls->thread2); | ||
| 374 | } | 67 | } |
| 375 | 68 | ||
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile deleted file mode 100644 index 35e9730bc3a8..000000000000 --- a/fs/gfs2/locking/nolock/Makefile +++ /dev/null | |||
| @@ -1,3 +0,0 @@ | |||
| 1 | obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o | ||
| 2 | lock_nolock-y := main.o | ||
| 3 | |||
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c deleted file mode 100644 index 284a5ece8d94..000000000000 --- a/fs/gfs2/locking/nolock/main.c +++ /dev/null | |||
| @@ -1,238 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
| 3 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
| 4 | * | ||
| 5 | * This copyrighted material is made available to anyone wishing to use, | ||
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
| 7 | * of the GNU General Public License version 2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/init.h> | ||
| 13 | #include <linux/types.h> | ||
| 14 | #include <linux/fs.h> | ||
| 15 | #include <linux/lm_interface.h> | ||
| 16 | |||
| 17 | struct nolock_lockspace { | ||
| 18 | unsigned int nl_lvb_size; | ||
| 19 | }; | ||
| 20 | |||
| 21 | static const struct lm_lockops nolock_ops; | ||
| 22 | |||
| 23 | static int nolock_mount(char *table_name, char *host_data, | ||
| 24 | lm_callback_t cb, void *cb_data, | ||
| 25 | unsigned int min_lvb_size, int flags, | ||
| 26 | struct lm_lockstruct *lockstruct, | ||
| 27 | struct kobject *fskobj) | ||
| 28 | { | ||
| 29 | char *c; | ||
| 30 | unsigned int jid; | ||
| 31 | struct nolock_lockspace *nl; | ||
| 32 | |||
| 33 | c = strstr(host_data, "jid="); | ||
| 34 | if (!c) | ||
| 35 | jid = 0; | ||
| 36 | else { | ||
| 37 | c += 4; | ||
| 38 | sscanf(c, "%u", &jid); | ||
| 39 | } | ||
| 40 | |||
| 41 | nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); | ||
| 42 | if (!nl) | ||
| 43 | return -ENOMEM; | ||
| 44 | |||
| 45 | nl->nl_lvb_size = min_lvb_size; | ||
| 46 | |||
| 47 | lockstruct->ls_jid = jid; | ||
| 48 | lockstruct->ls_first = 1; | ||
| 49 | lockstruct->ls_lvb_size = min_lvb_size; | ||
| 50 | lockstruct->ls_lockspace = nl; | ||
| 51 | lockstruct->ls_ops = &nolock_ops; | ||
| 52 | lockstruct->ls_flags = LM_LSFLAG_LOCAL; | ||
| 53 | |||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static void nolock_others_may_mount(void *lockspace) | ||
| 58 | { | ||
| 59 | } | ||
| 60 | |||
| 61 | static void nolock_unmount(void *lockspace) | ||
| 62 | { | ||
| 63 | struct nolock_lockspace *nl = lockspace; | ||
| 64 | kfree(nl); | ||
| 65 | } | ||
| 66 | |||
| 67 | static void nolock_withdraw(void *lockspace) | ||
| 68 | { | ||
| 69 | } | ||
| 70 | |||
| 71 | /** | ||
| 72 | * nolock_get_lock - get a lm_lock_t given a descripton of the lock | ||
| 73 | * @lockspace: the lockspace the lock lives in | ||
| 74 | * @name: the name of the lock | ||
| 75 | * @lockp: return the lm_lock_t here | ||
| 76 | * | ||
| 77 | * Returns: 0 on success, -EXXX on failure | ||
| 78 | */ | ||
| 79 | |||
| 80 | static int nolock_get_lock(void *lockspace, struct lm_lockname *name, | ||
| 81 | void **lockp) | ||
| 82 | { | ||
| 83 | *lockp = lockspace; | ||
| 84 | return 0; | ||
| 85 | } | ||
| 86 | |||
| 87 | /** | ||
| 88 | * nolock_put_lock - get rid of a lock structure | ||
| 89 | * @lock: the lock to throw away | ||
| 90 | * | ||
| 91 | */ | ||
| 92 | |||
| 93 | static void nolock_put_lock(void *lock) | ||
| 94 | { | ||
| 95 | } | ||
| 96 | |||
| 97 | /** | ||
| 98 | * nolock_lock - acquire a lock | ||
| 99 | * @lock: the lock to manipulate | ||
| 100 | * @cur_state: the current state | ||
| 101 | * @req_state: the requested state | ||
| 102 | * @flags: modifier flags | ||
| 103 | * | ||
| 104 | * Returns: A bitmap of LM_OUT_* | ||
| 105 | */ | ||
| 106 | |||
| 107 | static unsigned int nolock_lock(void *lock, unsigned int cur_state, | ||
| 108 | unsigned int req_state, unsigned int flags) | ||
| 109 | { | ||
| 110 | return req_state | LM_OUT_CACHEABLE; | ||
| 111 | } | ||
| 112 | |||
| 113 | /** | ||
| 114 | * nolock_unlock - unlock a lock | ||
| 115 | * @lock: the lock to manipulate | ||
| 116 | * @cur_state: the current state | ||
| 117 | * | ||
| 118 | * Returns: 0 | ||
| 119 | */ | ||
| 120 | |||
| 121 | static unsigned int nolock_unlock(void *lock, unsigned int cur_state) | ||
| 122 | { | ||
| 123 | return 0; | ||
| 124 | } | ||
| 125 | |||
| 126 | static void nolock_cancel(void *lock) | ||
| 127 | { | ||
| 128 | } | ||
| 129 | |||
| 130 | /** | ||
| 131 | * nolock_hold_lvb - hold on to a lock value block | ||
| 132 | * @lock: the lock the LVB is associated with | ||
| 133 | * @lvbp: return the lm_lvb_t here | ||
| 134 | * | ||
| 135 | * Returns: 0 on success, -EXXX on failure | ||
| 136 | */ | ||
| 137 | |||
| 138 | static int nolock_hold_lvb(void *lock, char **lvbp) | ||
| 139 | { | ||
| 140 | struct nolock_lockspace *nl = lock; | ||
| 141 | int error = 0; | ||
| 142 | |||
| 143 | *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS); | ||
| 144 | if (!*lvbp) | ||
| 145 | error = -ENOMEM; | ||
| 146 | |||
| 147 | return error; | ||
| 148 | } | ||
| 149 | |||
| 150 | /** | ||
| 151 | * nolock_unhold_lvb - release a LVB | ||
| 152 | * @lock: the lock the LVB is associated with | ||
| 153 | * @lvb: the lock value block | ||
| 154 | * | ||
| 155 | */ | ||
| 156 | |||
| 157 | static void nolock_unhold_lvb(void *lock, char *lvb) | ||
| 158 | { | ||
| 159 | kfree(lvb); | ||
| 160 | } | ||
| 161 | |||
| 162 | static int nolock_plock_get(void *lockspace, struct lm_lockname *name, | ||
| 163 | struct file *file, struct file_lock *fl) | ||
| 164 | { | ||
| 165 | posix_test_lock(file, fl); | ||
| 166 | |||
| 167 | return 0; | ||
| 168 | } | ||
| 169 | |||
| 170 | static int nolock_plock(void *lockspace, struct lm_lockname *name, | ||
| 171 | struct file *file, int cmd, struct file_lock *fl) | ||
| 172 | { | ||
| 173 | int error; | ||
| 174 | error = posix_lock_file_wait(file, fl); | ||
| 175 | return error; | ||
| 176 | } | ||
| 177 | |||
| 178 | static int nolock_punlock(void *lockspace, struct lm_lockname *name, | ||
| 179 | struct file *file, struct file_lock *fl) | ||
| 180 | { | ||
| 181 | int error; | ||
| 182 | error = posix_lock_file_wait(file, fl); | ||
| 183 | return error; | ||
| 184 | } | ||
| 185 | |||
| 186 | static void nolock_recovery_done(void *lockspace, unsigned int jid, | ||
| 187 | unsigned int message) | ||
| 188 | { | ||
| 189 | } | ||
| 190 | |||
| 191 | static const struct lm_lockops nolock_ops = { | ||
| 192 | .lm_proto_name = "lock_nolock", | ||
| 193 | .lm_mount = nolock_mount, | ||
| 194 | .lm_others_may_mount = nolock_others_may_mount, | ||
| 195 | .lm_unmount = nolock_unmount, | ||
| 196 | .lm_withdraw = nolock_withdraw, | ||
| 197 | .lm_get_lock = nolock_get_lock, | ||
| 198 | .lm_put_lock = nolock_put_lock, | ||
| 199 | .lm_lock = nolock_lock, | ||
| 200 | .lm_unlock = nolock_unlock, | ||
| 201 | .lm_cancel = nolock_cancel, | ||
| 202 | .lm_hold_lvb = nolock_hold_lvb, | ||
| 203 | .lm_unhold_lvb = nolock_unhold_lvb, | ||
| 204 | .lm_plock_get = nolock_plock_get, | ||
| 205 | .lm_plock = nolock_plock, | ||
| 206 | .lm_punlock = nolock_punlock, | ||
| 207 | .lm_recovery_done = nolock_recovery_done, | ||
| 208 | .lm_owner = THIS_MODULE, | ||
| 209 | }; | ||
| 210 | |||
| 211 | static int __init init_nolock(void) | ||
| 212 | { | ||
| 213 | int error; | ||
| 214 | |||
| 215 | error = gfs2_register_lockproto(&nolock_ops); | ||
| 216 | if (error) { | ||
| 217 | printk(KERN_WARNING | ||
| 218 | "lock_nolock: can't register protocol: %d\n", error); | ||
| 219 | return error; | ||
| 220 | } | ||
| 221 | |||
| 222 | printk(KERN_INFO | ||
| 223 | "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__); | ||
| 224 | return 0; | ||
| 225 | } | ||
| 226 | |||
| 227 | static void __exit exit_nolock(void) | ||
| 228 | { | ||
| 229 | gfs2_unregister_lockproto(&nolock_ops); | ||
| 230 | } | ||
| 231 | |||
| 232 | module_init(init_nolock); | ||
| 233 | module_exit(exit_nolock); | ||
| 234 | |||
| 235 | MODULE_DESCRIPTION("GFS Nolock Locking Module"); | ||
| 236 | MODULE_AUTHOR("Red Hat, Inc."); | ||
| 237 | MODULE_LICENSE("GPL"); | ||
| 238 | |||
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 548264b1836d..6c6af9f5e3ab 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c | |||
| @@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) | |||
| 87 | */ | 87 | */ |
| 88 | 88 | ||
| 89 | static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) | 89 | static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) |
| 90 | __releases(&sdp->sd_log_lock) | ||
| 91 | __acquires(&sdp->sd_log_lock) | ||
| 90 | { | 92 | { |
| 91 | struct gfs2_bufdata *bd, *s; | 93 | struct gfs2_bufdata *bd, *s; |
| 92 | struct buffer_head *bh; | 94 | struct buffer_head *bh; |
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 771152816508..7c64510ccfd2 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | static inline void gfs2_log_lock(struct gfs2_sbd *sdp) | 23 | static inline void gfs2_log_lock(struct gfs2_sbd *sdp) |
| 24 | __acquires(&sdp->sd_log_lock) | ||
| 24 | { | 25 | { |
| 25 | spin_lock(&sdp->sd_log_lock); | 26 | spin_lock(&sdp->sd_log_lock); |
| 26 | } | 27 | } |
| @@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) | |||
| 32 | */ | 33 | */ |
| 33 | 34 | ||
| 34 | static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) | 35 | static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) |
| 36 | __releases(&sdp->sd_log_lock) | ||
| 35 | { | 37 | { |
| 36 | spin_unlock(&sdp->sd_log_lock); | 38 | spin_unlock(&sdp->sd_log_lock); |
| 37 | } | 39 | } |
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 053e2ebbbd50..bcc668d0fadd 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c | |||
| @@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) | |||
| 40 | INIT_HLIST_NODE(&gl->gl_list); | 40 | INIT_HLIST_NODE(&gl->gl_list); |
| 41 | spin_lock_init(&gl->gl_spin); | 41 | spin_lock_init(&gl->gl_spin); |
| 42 | INIT_LIST_HEAD(&gl->gl_holders); | 42 | INIT_LIST_HEAD(&gl->gl_holders); |
| 43 | INIT_LIST_HEAD(&gl->gl_waiters1); | ||
| 44 | INIT_LIST_HEAD(&gl->gl_waiters3); | ||
| 45 | gl->gl_lvb = NULL; | 43 | gl->gl_lvb = NULL; |
| 46 | atomic_set(&gl->gl_lvb_count, 0); | 44 | atomic_set(&gl->gl_lvb_count, 0); |
| 47 | INIT_LIST_HEAD(&gl->gl_reclaim); | 45 | INIT_LIST_HEAD(&gl->gl_reclaim); |
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 78d75f892f82..09853620c951 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c | |||
| @@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) | |||
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | /** | 131 | /** |
| 132 | * getbuf - Get a buffer with a given address space | 132 | * gfs2_getbuf - Get a buffer with a given address space |
| 133 | * @gl: the glock | 133 | * @gl: the glock |
| 134 | * @blkno: the block number (filesystem scope) | 134 | * @blkno: the block number (filesystem scope) |
| 135 | * @create: 1 if the buffer should be created | 135 | * @create: 1 if the buffer should be created |
| @@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) | |||
| 137 | * Returns: the buffer | 137 | * Returns: the buffer |
| 138 | */ | 138 | */ |
| 139 | 139 | ||
| 140 | static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) | 140 | struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) |
| 141 | { | 141 | { |
| 142 | struct address_space *mapping = gl->gl_aspace->i_mapping; | 142 | struct address_space *mapping = gl->gl_aspace->i_mapping; |
| 143 | struct gfs2_sbd *sdp = gl->gl_sbd; | 143 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| @@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh) | |||
| 205 | struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) | 205 | struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) |
| 206 | { | 206 | { |
| 207 | struct buffer_head *bh; | 207 | struct buffer_head *bh; |
| 208 | bh = getbuf(gl, blkno, CREATE); | 208 | bh = gfs2_getbuf(gl, blkno, CREATE); |
| 209 | meta_prep_new(bh); | 209 | meta_prep_new(bh); |
| 210 | return bh; | 210 | return bh; |
| 211 | } | 211 | } |
| @@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) | |||
| 223 | int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, | 223 | int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, |
| 224 | struct buffer_head **bhp) | 224 | struct buffer_head **bhp) |
| 225 | { | 225 | { |
| 226 | *bhp = getbuf(gl, blkno, CREATE); | 226 | *bhp = gfs2_getbuf(gl, blkno, CREATE); |
| 227 | if (!buffer_uptodate(*bhp)) { | 227 | if (!buffer_uptodate(*bhp)) { |
| 228 | ll_rw_block(READ_META, 1, bhp); | 228 | ll_rw_block(READ_META, 1, bhp); |
| 229 | if (flags & DIO_WAIT) { | 229 | if (flags & DIO_WAIT) { |
| @@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) | |||
| 346 | struct buffer_head *bh; | 346 | struct buffer_head *bh; |
| 347 | 347 | ||
| 348 | while (blen) { | 348 | while (blen) { |
| 349 | bh = getbuf(ip->i_gl, bstart, NO_CREATE); | 349 | bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); |
| 350 | if (bh) { | 350 | if (bh) { |
| 351 | lock_buffer(bh); | 351 | lock_buffer(bh); |
| 352 | gfs2_log_lock(sdp); | 352 | gfs2_log_lock(sdp); |
| @@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) | |||
| 421 | if (extlen > max_ra) | 421 | if (extlen > max_ra) |
| 422 | extlen = max_ra; | 422 | extlen = max_ra; |
| 423 | 423 | ||
| 424 | first_bh = getbuf(gl, dblock, CREATE); | 424 | first_bh = gfs2_getbuf(gl, dblock, CREATE); |
| 425 | 425 | ||
| 426 | if (buffer_uptodate(first_bh)) | 426 | if (buffer_uptodate(first_bh)) |
| 427 | goto out; | 427 | goto out; |
| @@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) | |||
| 432 | extlen--; | 432 | extlen--; |
| 433 | 433 | ||
| 434 | while (extlen) { | 434 | while (extlen) { |
| 435 | bh = getbuf(gl, dblock, CREATE); | 435 | bh = gfs2_getbuf(gl, dblock, CREATE); |
| 436 | 436 | ||
| 437 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) | 437 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) |
| 438 | ll_rw_block(READA, 1, &bh); | 438 | ll_rw_block(READA, 1, &bh); |
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 73e3b1c76fe1..b1a5f3674d43 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h | |||
| @@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); | |||
| 47 | int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, | 47 | int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, |
| 48 | int flags, struct buffer_head **bhp); | 48 | int flags, struct buffer_head **bhp); |
| 49 | int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); | 49 | int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); |
| 50 | struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); | ||
| 50 | 51 | ||
| 51 | void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, | 52 | void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, |
| 52 | int meta); | 53 | int meta); |
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index f55394e57cb2..e64a1b04117a 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c | |||
| @@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page) | |||
| 499 | * @file: The file to read | 499 | * @file: The file to read |
| 500 | * @page: The page of the file | 500 | * @page: The page of the file |
| 501 | * | 501 | * |
| 502 | * This deals with the locking required. We use a trylock in order to | 502 | * This deals with the locking required. We have to unlock and |
| 503 | * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE | 503 | * relock the page in order to get the locking in the right |
| 504 | * in the event that we are unable to get the lock. | 504 | * order. |
| 505 | */ | 505 | */ |
| 506 | 506 | ||
| 507 | static int gfs2_readpage(struct file *file, struct page *page) | 507 | static int gfs2_readpage(struct file *file, struct page *page) |
| 508 | { | 508 | { |
| 509 | struct gfs2_inode *ip = GFS2_I(page->mapping->host); | 509 | struct address_space *mapping = page->mapping; |
| 510 | struct gfs2_holder *gh; | 510 | struct gfs2_inode *ip = GFS2_I(mapping->host); |
| 511 | struct gfs2_holder gh; | ||
| 511 | int error; | 512 | int error; |
| 512 | 513 | ||
| 513 | gh = gfs2_glock_is_locked_by_me(ip->i_gl); | 514 | unlock_page(page); |
| 514 | if (!gh) { | 515 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); |
| 515 | gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); | 516 | error = gfs2_glock_nq_atime(&gh); |
| 516 | if (!gh) | 517 | if (unlikely(error)) |
| 517 | return -ENOBUFS; | 518 | goto out; |
| 518 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); | 519 | error = AOP_TRUNCATED_PAGE; |
| 520 | lock_page(page); | ||
| 521 | if (page->mapping == mapping && !PageUptodate(page)) | ||
| 522 | error = __gfs2_readpage(file, page); | ||
| 523 | else | ||
| 519 | unlock_page(page); | 524 | unlock_page(page); |
| 520 | error = gfs2_glock_nq_atime(gh); | 525 | gfs2_glock_dq(&gh); |
| 521 | if (likely(error != 0)) | ||
| 522 | goto out; | ||
| 523 | return AOP_TRUNCATED_PAGE; | ||
| 524 | } | ||
| 525 | error = __gfs2_readpage(file, page); | ||
| 526 | gfs2_glock_dq(gh); | ||
| 527 | out: | 526 | out: |
| 528 | gfs2_holder_uninit(gh); | 527 | gfs2_holder_uninit(&gh); |
| 529 | kfree(gh); | 528 | if (error && error != AOP_TRUNCATED_PAGE) |
| 529 | lock_page(page); | ||
| 530 | return error; | 530 | return error; |
| 531 | } | 531 | } |
| 532 | 532 | ||
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a066..e9a366d4411c 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/uio.h> | 15 | #include <linux/uio.h> |
| 16 | #include <linux/blkdev.h> | 16 | #include <linux/blkdev.h> |
| 17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
| 18 | #include <linux/mount.h> | ||
| 18 | #include <linux/fs.h> | 19 | #include <linux/fs.h> |
| 19 | #include <linux/gfs2_ondisk.h> | 20 | #include <linux/gfs2_ondisk.h> |
| 20 | #include <linux/ext2_fs.h> | 21 | #include <linux/ext2_fs.h> |
| @@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) | |||
| 62 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, | 63 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, |
| 63 | &i_gh); | 64 | &i_gh); |
| 64 | if (!error) { | 65 | if (!error) { |
| 65 | error = remote_llseek(file, offset, origin); | 66 | error = generic_file_llseek_unlocked(file, offset, origin); |
| 66 | gfs2_glock_dq_uninit(&i_gh); | 67 | gfs2_glock_dq_uninit(&i_gh); |
| 67 | } | 68 | } |
| 68 | } else | 69 | } else |
| 69 | error = remote_llseek(file, offset, origin); | 70 | error = generic_file_llseek_unlocked(file, offset, origin); |
| 70 | 71 | ||
| 71 | return error; | 72 | return error; |
| 72 | } | 73 | } |
| @@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = { | |||
| 133 | [7] = GFS2_DIF_NOATIME, | 134 | [7] = GFS2_DIF_NOATIME, |
| 134 | [12] = GFS2_DIF_EXHASH, | 135 | [12] = GFS2_DIF_EXHASH, |
| 135 | [14] = GFS2_DIF_INHERIT_JDATA, | 136 | [14] = GFS2_DIF_INHERIT_JDATA, |
| 136 | [20] = GFS2_DIF_INHERIT_DIRECTIO, | ||
| 137 | }; | 137 | }; |
| 138 | 138 | ||
| 139 | static const u32 gfs2_to_fsflags[32] = { | 139 | static const u32 gfs2_to_fsflags[32] = { |
| @@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = { | |||
| 142 | [gfs2fl_AppendOnly] = FS_APPEND_FL, | 142 | [gfs2fl_AppendOnly] = FS_APPEND_FL, |
| 143 | [gfs2fl_NoAtime] = FS_NOATIME_FL, | 143 | [gfs2fl_NoAtime] = FS_NOATIME_FL, |
| 144 | [gfs2fl_ExHash] = FS_INDEX_FL, | 144 | [gfs2fl_ExHash] = FS_INDEX_FL, |
| 145 | [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL, | ||
| 146 | [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, | 145 | [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, |
| 147 | }; | 146 | }; |
| 148 | 147 | ||
| @@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) | |||
| 160 | return error; | 159 | return error; |
| 161 | 160 | ||
| 162 | fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); | 161 | fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); |
| 163 | if (!S_ISDIR(inode->i_mode)) { | 162 | if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) |
| 164 | if (ip->i_di.di_flags & GFS2_DIF_JDATA) | 163 | fsflags |= FS_JOURNAL_DATA_FL; |
| 165 | fsflags |= FS_JOURNAL_DATA_FL; | ||
| 166 | if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) | ||
| 167 | fsflags |= FS_DIRECTIO_FL; | ||
| 168 | } | ||
| 169 | if (put_user(fsflags, ptr)) | 164 | if (put_user(fsflags, ptr)) |
| 170 | error = -EFAULT; | 165 | error = -EFAULT; |
| 171 | 166 | ||
| @@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode) | |||
| 194 | 189 | ||
| 195 | /* Flags that can be set by user space */ | 190 | /* Flags that can be set by user space */ |
| 196 | #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ | 191 | #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ |
| 197 | GFS2_DIF_DIRECTIO| \ | ||
| 198 | GFS2_DIF_IMMUTABLE| \ | 192 | GFS2_DIF_IMMUTABLE| \ |
| 199 | GFS2_DIF_APPENDONLY| \ | 193 | GFS2_DIF_APPENDONLY| \ |
| 200 | GFS2_DIF_NOATIME| \ | 194 | GFS2_DIF_NOATIME| \ |
| 201 | GFS2_DIF_SYNC| \ | 195 | GFS2_DIF_SYNC| \ |
| 202 | GFS2_DIF_SYSTEM| \ | 196 | GFS2_DIF_SYSTEM| \ |
| 203 | GFS2_DIF_INHERIT_DIRECTIO| \ | ||
| 204 | GFS2_DIF_INHERIT_JDATA) | 197 | GFS2_DIF_INHERIT_JDATA) |
| 205 | 198 | ||
| 206 | /** | 199 | /** |
| @@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) | |||
| 220 | int error; | 213 | int error; |
| 221 | u32 new_flags, flags; | 214 | u32 new_flags, flags; |
| 222 | 215 | ||
| 223 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | 216 | error = mnt_want_write(filp->f_path.mnt); |
| 224 | if (error) | 217 | if (error) |
| 225 | return error; | 218 | return error; |
| 226 | 219 | ||
| 220 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | ||
| 221 | if (error) | ||
| 222 | goto out_drop_write; | ||
| 223 | |||
| 227 | flags = ip->i_di.di_flags; | 224 | flags = ip->i_di.di_flags; |
| 228 | new_flags = (flags & ~mask) | (reqflags & mask); | 225 | new_flags = (flags & ~mask) | (reqflags & mask); |
| 229 | if ((new_flags ^ flags) == 0) | 226 | if ((new_flags ^ flags) == 0) |
| @@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) | |||
| 242 | !capable(CAP_LINUX_IMMUTABLE)) | 239 | !capable(CAP_LINUX_IMMUTABLE)) |
| 243 | goto out; | 240 | goto out; |
| 244 | if (!IS_IMMUTABLE(inode)) { | 241 | if (!IS_IMMUTABLE(inode)) { |
| 245 | error = permission(inode, MAY_WRITE, NULL); | 242 | error = gfs2_permission(inode, MAY_WRITE); |
| 246 | if (error) | 243 | if (error) |
| 247 | goto out; | 244 | goto out; |
| 248 | } | 245 | } |
| @@ -272,6 +269,8 @@ out_trans_end: | |||
| 272 | gfs2_trans_end(sdp); | 269 | gfs2_trans_end(sdp); |
| 273 | out: | 270 | out: |
| 274 | gfs2_glock_dq_uninit(&gh); | 271 | gfs2_glock_dq_uninit(&gh); |
| 272 | out_drop_write: | ||
| 273 | mnt_drop_write(filp->f_path.mnt); | ||
| 275 | return error; | 274 | return error; |
| 276 | } | 275 | } |
| 277 | 276 | ||
| @@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) | |||
| 285 | if (!S_ISDIR(inode->i_mode)) { | 284 | if (!S_ISDIR(inode->i_mode)) { |
| 286 | if (gfsflags & GFS2_DIF_INHERIT_JDATA) | 285 | if (gfsflags & GFS2_DIF_INHERIT_JDATA) |
| 287 | gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); | 286 | gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); |
| 288 | if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO) | ||
| 289 | gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO); | ||
| 290 | return do_gfs2_set_flags(filp, gfsflags, ~0); | 287 | return do_gfs2_set_flags(filp, gfsflags, ~0); |
| 291 | } | 288 | } |
| 292 | return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); | 289 | return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); |
| @@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file) | |||
| 487 | goto fail_gunlock; | 484 | goto fail_gunlock; |
| 488 | } | 485 | } |
| 489 | 486 | ||
| 490 | /* Listen to the Direct I/O flag */ | ||
| 491 | |||
| 492 | if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) | ||
| 493 | file->f_flags |= O_DIRECT; | ||
| 494 | |||
| 495 | gfs2_glock_dq_uninit(&i_gh); | 487 | gfs2_glock_dq_uninit(&i_gh); |
| 496 | } | 488 | } |
| 497 | 489 | ||
| @@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) | |||
| 669 | int error = 0; | 661 | int error = 0; |
| 670 | 662 | ||
| 671 | state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; | 663 | state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; |
| 672 | flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE | 664 | flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; |
| 673 | | GL_FLOCK; | ||
| 674 | 665 | ||
| 675 | mutex_lock(&fp->f_fl_mutex); | 666 | mutex_lock(&fp->f_fl_mutex); |
| 676 | 667 | ||
| @@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) | |||
| 683 | gfs2_glock_dq_wait(fl_gh); | 674 | gfs2_glock_dq_wait(fl_gh); |
| 684 | gfs2_holder_reinit(state, flags, fl_gh); | 675 | gfs2_holder_reinit(state, flags, fl_gh); |
| 685 | } else { | 676 | } else { |
| 686 | error = gfs2_glock_get(GFS2_SB(&ip->i_inode), | 677 | error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, |
| 687 | ip->i_no_addr, &gfs2_flock_glops, | 678 | &gfs2_flock_glops, CREATE, &gl); |
| 688 | CREATE, &gl); | ||
| 689 | if (error) | 679 | if (error) |
| 690 | goto out; | 680 | goto out; |
| 691 | gfs2_holder_init(gl, state, flags, fl_gh); | 681 | gfs2_holder_init(gl, state, flags, fl_gh); |
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b2028c82e8d1..b4d1d6490633 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
| @@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
| 64 | mutex_init(&sdp->sd_rindex_mutex); | 64 | mutex_init(&sdp->sd_rindex_mutex); |
| 65 | INIT_LIST_HEAD(&sdp->sd_rindex_list); | 65 | INIT_LIST_HEAD(&sdp->sd_rindex_list); |
| 66 | INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); | 66 | INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); |
| 67 | INIT_LIST_HEAD(&sdp->sd_rindex_recent_list); | ||
| 68 | 67 | ||
| 69 | INIT_LIST_HEAD(&sdp->sd_jindex_list); | 68 | INIT_LIST_HEAD(&sdp->sd_jindex_list); |
| 70 | spin_lock_init(&sdp->sd_jindex_spin); | 69 | spin_lock_init(&sdp->sd_jindex_spin); |
| @@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp) | |||
| 364 | 363 | ||
| 365 | static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) | 364 | static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) |
| 366 | { | 365 | { |
| 366 | if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) | ||
| 367 | return; | ||
| 367 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 368 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 368 | sdp->sd_lockstruct.ls_ops->lm_others_may_mount( | 369 | sdp->sd_lockstruct.ls_ops->lm_others_may_mount( |
| 369 | sdp->sd_lockstruct.ls_lockspace); | 370 | sdp->sd_lockstruct.ls_lockspace); |
| @@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) | |||
| 741 | goto out; | 742 | goto out; |
| 742 | } | 743 | } |
| 743 | 744 | ||
| 744 | if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || | 745 | if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || |
| 745 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || | ||
| 746 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= | 746 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= |
| 747 | GFS2_MIN_LVB_SIZE)) { | 747 | GFS2_MIN_LVB_SIZE)) { |
| 748 | gfs2_unmount_lockproto(&sdp->sd_lockstruct); | 748 | gfs2_unmount_lockproto(&sdp->sd_lockstruct); |
| @@ -873,7 +873,7 @@ fail_sb: | |||
| 873 | fail_locking: | 873 | fail_locking: |
| 874 | init_locking(sdp, &mount_gh, UNDO); | 874 | init_locking(sdp, &mount_gh, UNDO); |
| 875 | fail_lm: | 875 | fail_lm: |
| 876 | gfs2_gl_hash_clear(sdp, WAIT); | 876 | gfs2_gl_hash_clear(sdp); |
| 877 | gfs2_lm_unmount(sdp); | 877 | gfs2_lm_unmount(sdp); |
| 878 | while (invalidate_inodes(sb)) | 878 | while (invalidate_inodes(sb)) |
| 879 | yield(); | 879 | yield(); |
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 2686ad4c0029..1e252dfc5294 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
| @@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
| 163 | if (error) | 163 | if (error) |
| 164 | goto out; | 164 | goto out; |
| 165 | 165 | ||
| 166 | error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); | 166 | error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); |
| 167 | if (error) | 167 | if (error) |
| 168 | goto out_gunlock; | 168 | goto out_gunlock; |
| 169 | 169 | ||
| @@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
| 669 | } | 669 | } |
| 670 | } | 670 | } |
| 671 | } else { | 671 | } else { |
| 672 | error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); | 672 | error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); |
| 673 | if (error) | 673 | if (error) |
| 674 | goto out_gunlock; | 674 | goto out_gunlock; |
| 675 | 675 | ||
| @@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
| 704 | /* Check out the dir to be renamed */ | 704 | /* Check out the dir to be renamed */ |
| 705 | 705 | ||
| 706 | if (dir_rename) { | 706 | if (dir_rename) { |
| 707 | error = permission(odentry->d_inode, MAY_WRITE, NULL); | 707 | error = gfs2_permission(odentry->d_inode, MAY_WRITE); |
| 708 | if (error) | 708 | if (error) |
| 709 | goto out_gunlock; | 709 | goto out_gunlock; |
| 710 | } | 710 | } |
| @@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
| 891 | * Returns: errno | 891 | * Returns: errno |
| 892 | */ | 892 | */ |
| 893 | 893 | ||
| 894 | static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | 894 | int gfs2_permission(struct inode *inode, int mask) |
| 895 | { | 895 | { |
| 896 | struct gfs2_inode *ip = GFS2_I(inode); | 896 | struct gfs2_inode *ip = GFS2_I(inode); |
| 897 | struct gfs2_holder i_gh; | 897 | struct gfs2_holder i_gh; |
| @@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 905 | unlock = 1; | 905 | unlock = 1; |
| 906 | } | 906 | } |
| 907 | 907 | ||
| 908 | error = generic_permission(inode, mask, gfs2_check_acl); | 908 | if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) |
| 909 | error = -EACCES; | ||
| 910 | else | ||
| 911 | error = generic_permission(inode, mask, gfs2_check_acl); | ||
| 909 | if (unlock) | 912 | if (unlock) |
| 910 | gfs2_glock_dq_uninit(&i_gh); | 913 | gfs2_glock_dq_uninit(&i_gh); |
| 911 | 914 | ||
| 912 | return error; | 915 | return error; |
| 913 | } | 916 | } |
| 914 | 917 | ||
| 918 | static int gfs2_iop_permission(struct inode *inode, int mask, | ||
| 919 | struct nameidata *nd) | ||
| 920 | { | ||
| 921 | return gfs2_permission(inode, mask); | ||
| 922 | } | ||
| 923 | |||
| 915 | static int setattr_size(struct inode *inode, struct iattr *attr) | 924 | static int setattr_size(struct inode *inode, struct iattr *attr) |
| 916 | { | 925 | { |
| 917 | struct gfs2_inode *ip = GFS2_I(inode); | 926 | struct gfs2_inode *ip = GFS2_I(inode); |
| @@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) | |||
| 1141 | } | 1150 | } |
| 1142 | 1151 | ||
| 1143 | const struct inode_operations gfs2_file_iops = { | 1152 | const struct inode_operations gfs2_file_iops = { |
| 1144 | .permission = gfs2_permission, | 1153 | .permission = gfs2_iop_permission, |
| 1145 | .setattr = gfs2_setattr, | 1154 | .setattr = gfs2_setattr, |
| 1146 | .getattr = gfs2_getattr, | 1155 | .getattr = gfs2_getattr, |
| 1147 | .setxattr = gfs2_setxattr, | 1156 | .setxattr = gfs2_setxattr, |
| @@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = { | |||
| 1160 | .rmdir = gfs2_rmdir, | 1169 | .rmdir = gfs2_rmdir, |
| 1161 | .mknod = gfs2_mknod, | 1170 | .mknod = gfs2_mknod, |
| 1162 | .rename = gfs2_rename, | 1171 | .rename = gfs2_rename, |
| 1163 | .permission = gfs2_permission, | 1172 | .permission = gfs2_iop_permission, |
| 1164 | .setattr = gfs2_setattr, | 1173 | .setattr = gfs2_setattr, |
| 1165 | .getattr = gfs2_getattr, | 1174 | .getattr = gfs2_getattr, |
| 1166 | .setxattr = gfs2_setxattr, | 1175 | .setxattr = gfs2_setxattr, |
| @@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = { | |||
| 1172 | const struct inode_operations gfs2_symlink_iops = { | 1181 | const struct inode_operations gfs2_symlink_iops = { |
| 1173 | .readlink = gfs2_readlink, | 1182 | .readlink = gfs2_readlink, |
| 1174 | .follow_link = gfs2_follow_link, | 1183 | .follow_link = gfs2_follow_link, |
| 1175 | .permission = gfs2_permission, | 1184 | .permission = gfs2_iop_permission, |
| 1176 | .setattr = gfs2_setattr, | 1185 | .setattr = gfs2_setattr, |
| 1177 | .getattr = gfs2_getattr, | 1186 | .getattr = gfs2_getattr, |
| 1178 | .setxattr = gfs2_setxattr, | 1187 | .setxattr = gfs2_setxattr, |
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 0b7cc920eb89..f66ea0f7a356 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c | |||
| @@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb) | |||
| 126 | gfs2_clear_rgrpd(sdp); | 126 | gfs2_clear_rgrpd(sdp); |
| 127 | gfs2_jindex_free(sdp); | 127 | gfs2_jindex_free(sdp); |
| 128 | /* Take apart glock structures and buffer lists */ | 128 | /* Take apart glock structures and buffer lists */ |
| 129 | gfs2_gl_hash_clear(sdp, WAIT); | 129 | gfs2_gl_hash_clear(sdp); |
| 130 | /* Unmount the locking protocol */ | 130 | /* Unmount the locking protocol */ |
| 131 | gfs2_lm_unmount(sdp); | 131 | gfs2_lm_unmount(sdp); |
| 132 | 132 | ||
| @@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb) | |||
| 155 | static int gfs2_sync_fs(struct super_block *sb, int wait) | 155 | static int gfs2_sync_fs(struct super_block *sb, int wait) |
| 156 | { | 156 | { |
| 157 | sb->s_dirt = 0; | 157 | sb->s_dirt = 0; |
| 158 | if (wait) | 158 | if (wait && sb->s_fs_info) |
| 159 | gfs2_log_flush(sb->s_fs_info, NULL); | 159 | gfs2_log_flush(sb->s_fs_info, NULL); |
| 160 | return 0; | 160 | return 0; |
| 161 | } | 161 | } |
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 56aaf915c59a..3e073f5144fa 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
| @@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd) | |||
| 904 | do_sync = 0; | 904 | do_sync = 0; |
| 905 | else { | 905 | else { |
| 906 | value *= gfs2_jindex_size(sdp) * num; | 906 | value *= gfs2_jindex_size(sdp) * num; |
| 907 | do_div(value, den); | 907 | value = div_s64(value, den); |
| 908 | value += (s64)be64_to_cpu(qd->qd_qb.qb_value); | 908 | value += (s64)be64_to_cpu(qd->qd_qb.qb_value); |
| 909 | if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) | 909 | if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) |
| 910 | do_sync = 0; | 910 | do_sync = 0; |
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 2888e4b4b1c5..d5e91f4f6a0b 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c | |||
| @@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea | |||
| 428 | static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, | 428 | static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, |
| 429 | unsigned int message) | 429 | unsigned int message) |
| 430 | { | 430 | { |
| 431 | if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) | ||
| 432 | return; | ||
| 433 | |||
| 431 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 434 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 432 | sdp->sd_lockstruct.ls_ops->lm_recovery_done( | 435 | sdp->sd_lockstruct.ls_ops->lm_recovery_done( |
| 433 | sdp->sd_lockstruct.ls_lockspace, jid, message); | 436 | sdp->sd_lockstruct.ls_lockspace, jid, message); |
| @@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) | |||
| 505 | 508 | ||
| 506 | error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, | 509 | error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, |
| 507 | LM_FLAG_NOEXP | LM_FLAG_PRIORITY | | 510 | LM_FLAG_NOEXP | LM_FLAG_PRIORITY | |
| 508 | GL_NOCANCEL | GL_NOCACHE, &t_gh); | 511 | GL_NOCACHE, &t_gh); |
| 509 | if (error) | 512 | if (error) |
| 510 | goto fail_gunlock_ji; | 513 | goto fail_gunlock_ji; |
| 511 | 514 | ||
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a3153..2d90fb253505 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
| @@ -195,7 +195,7 @@ ulong_aligned: | |||
| 195 | depending on architecture. I've experimented with several ways | 195 | depending on architecture. I've experimented with several ways |
| 196 | of writing this section such as using an else before the goto | 196 | of writing this section such as using an else before the goto |
| 197 | but this one seems to be the fastest. */ | 197 | but this one seems to be the fastest. */ |
| 198 | while ((unsigned char *)plong < end - 1) { | 198 | while ((unsigned char *)plong < end - sizeof(unsigned long)) { |
| 199 | prefetch(plong + 1); | 199 | prefetch(plong + 1); |
| 200 | if (((*plong) & LBITMASK) != lskipval) | 200 | if (((*plong) & LBITMASK) != lskipval) |
| 201 | break; | 201 | break; |
| @@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) | |||
| 371 | 371 | ||
| 372 | spin_lock(&sdp->sd_rindex_spin); | 372 | spin_lock(&sdp->sd_rindex_spin); |
| 373 | sdp->sd_rindex_forward = NULL; | 373 | sdp->sd_rindex_forward = NULL; |
| 374 | head = &sdp->sd_rindex_recent_list; | ||
| 375 | while (!list_empty(head)) { | ||
| 376 | rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); | ||
| 377 | list_del(&rgd->rd_recent); | ||
| 378 | } | ||
| 379 | spin_unlock(&sdp->sd_rindex_spin); | 374 | spin_unlock(&sdp->sd_rindex_spin); |
| 380 | 375 | ||
| 381 | head = &sdp->sd_rindex_list; | 376 | head = &sdp->sd_rindex_list; |
| @@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) | |||
| 945 | } | 940 | } |
| 946 | 941 | ||
| 947 | /** | 942 | /** |
| 948 | * recent_rgrp_first - get first RG from "recent" list | ||
| 949 | * @sdp: The GFS2 superblock | ||
| 950 | * @rglast: address of the rgrp used last | ||
| 951 | * | ||
| 952 | * Returns: The first rgrp in the recent list | ||
| 953 | */ | ||
| 954 | |||
| 955 | static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, | ||
| 956 | u64 rglast) | ||
| 957 | { | ||
| 958 | struct gfs2_rgrpd *rgd; | ||
| 959 | |||
| 960 | spin_lock(&sdp->sd_rindex_spin); | ||
| 961 | |||
| 962 | if (rglast) { | ||
| 963 | list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { | ||
| 964 | if (rgrp_contains_block(rgd, rglast)) | ||
| 965 | goto out; | ||
| 966 | } | ||
| 967 | } | ||
| 968 | rgd = NULL; | ||
| 969 | if (!list_empty(&sdp->sd_rindex_recent_list)) | ||
| 970 | rgd = list_entry(sdp->sd_rindex_recent_list.next, | ||
| 971 | struct gfs2_rgrpd, rd_recent); | ||
| 972 | out: | ||
| 973 | spin_unlock(&sdp->sd_rindex_spin); | ||
| 974 | return rgd; | ||
| 975 | } | ||
| 976 | |||
| 977 | /** | ||
| 978 | * recent_rgrp_next - get next RG from "recent" list | 943 | * recent_rgrp_next - get next RG from "recent" list |
| 979 | * @cur_rgd: current rgrp | 944 | * @cur_rgd: current rgrp |
| 980 | * @remove: | ||
| 981 | * | 945 | * |
| 982 | * Returns: The next rgrp in the recent list | 946 | * Returns: The next rgrp in the recent list |
| 983 | */ | 947 | */ |
| 984 | 948 | ||
| 985 | static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, | 949 | static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) |
| 986 | int remove) | ||
| 987 | { | 950 | { |
| 988 | struct gfs2_sbd *sdp = cur_rgd->rd_sbd; | 951 | struct gfs2_sbd *sdp = cur_rgd->rd_sbd; |
| 989 | struct list_head *head; | 952 | struct list_head *head; |
| 990 | struct gfs2_rgrpd *rgd; | 953 | struct gfs2_rgrpd *rgd; |
| 991 | 954 | ||
| 992 | spin_lock(&sdp->sd_rindex_spin); | 955 | spin_lock(&sdp->sd_rindex_spin); |
| 993 | 956 | head = &sdp->sd_rindex_mru_list; | |
| 994 | head = &sdp->sd_rindex_recent_list; | 957 | if (unlikely(cur_rgd->rd_list_mru.next == head)) { |
| 995 | 958 | spin_unlock(&sdp->sd_rindex_spin); | |
| 996 | list_for_each_entry(rgd, head, rd_recent) { | 959 | return NULL; |
| 997 | if (rgd == cur_rgd) { | ||
| 998 | if (cur_rgd->rd_recent.next != head) | ||
| 999 | rgd = list_entry(cur_rgd->rd_recent.next, | ||
| 1000 | struct gfs2_rgrpd, rd_recent); | ||
| 1001 | else | ||
| 1002 | rgd = NULL; | ||
| 1003 | |||
| 1004 | if (remove) | ||
| 1005 | list_del(&cur_rgd->rd_recent); | ||
| 1006 | |||
| 1007 | goto out; | ||
| 1008 | } | ||
| 1009 | } | 960 | } |
| 1010 | 961 | rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); | |
| 1011 | rgd = NULL; | ||
| 1012 | if (!list_empty(head)) | ||
| 1013 | rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); | ||
| 1014 | |||
| 1015 | out: | ||
| 1016 | spin_unlock(&sdp->sd_rindex_spin); | 962 | spin_unlock(&sdp->sd_rindex_spin); |
| 1017 | return rgd; | 963 | return rgd; |
| 1018 | } | 964 | } |
| 1019 | 965 | ||
| 1020 | /** | 966 | /** |
| 1021 | * recent_rgrp_add - add an RG to tail of "recent" list | ||
| 1022 | * @new_rgd: The rgrp to add | ||
| 1023 | * | ||
| 1024 | */ | ||
| 1025 | |||
| 1026 | static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd) | ||
| 1027 | { | ||
| 1028 | struct gfs2_sbd *sdp = new_rgd->rd_sbd; | ||
| 1029 | struct gfs2_rgrpd *rgd; | ||
| 1030 | unsigned int count = 0; | ||
| 1031 | unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp); | ||
| 1032 | |||
| 1033 | spin_lock(&sdp->sd_rindex_spin); | ||
| 1034 | |||
| 1035 | list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { | ||
| 1036 | if (rgd == new_rgd) | ||
| 1037 | goto out; | ||
| 1038 | |||
| 1039 | if (++count >= max) | ||
| 1040 | goto out; | ||
| 1041 | } | ||
| 1042 | list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list); | ||
| 1043 | |||
| 1044 | out: | ||
| 1045 | spin_unlock(&sdp->sd_rindex_spin); | ||
| 1046 | } | ||
| 1047 | |||
| 1048 | /** | ||
| 1049 | * forward_rgrp_get - get an rgrp to try next from full list | 967 | * forward_rgrp_get - get an rgrp to try next from full list |
| 1050 | * @sdp: The GFS2 superblock | 968 | * @sdp: The GFS2 superblock |
| 1051 | * | 969 | * |
| @@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
| 1112 | int loops = 0; | 1030 | int loops = 0; |
| 1113 | int error, rg_locked; | 1031 | int error, rg_locked; |
| 1114 | 1032 | ||
| 1115 | /* Try recently successful rgrps */ | 1033 | rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); |
| 1116 | |||
| 1117 | rgd = recent_rgrp_first(sdp, ip->i_goal); | ||
| 1118 | 1034 | ||
| 1119 | while (rgd) { | 1035 | while (rgd) { |
| 1120 | rg_locked = 0; | 1036 | rg_locked = 0; |
| @@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
| 1136 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1052 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
| 1137 | if (inode) | 1053 | if (inode) |
| 1138 | return inode; | 1054 | return inode; |
| 1139 | rgd = recent_rgrp_next(rgd, 1); | 1055 | /* fall through */ |
| 1140 | break; | ||
| 1141 | |||
| 1142 | case GLR_TRYFAILED: | 1056 | case GLR_TRYFAILED: |
| 1143 | rgd = recent_rgrp_next(rgd, 0); | 1057 | rgd = recent_rgrp_next(rgd); |
| 1144 | break; | 1058 | break; |
| 1145 | 1059 | ||
| 1146 | default: | 1060 | default: |
| @@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
| 1199 | 1113 | ||
| 1200 | out: | 1114 | out: |
| 1201 | if (begin) { | 1115 | if (begin) { |
| 1202 | recent_rgrp_add(rgd); | 1116 | spin_lock(&sdp->sd_rindex_spin); |
| 1117 | list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); | ||
| 1118 | spin_unlock(&sdp->sd_rindex_spin); | ||
| 1203 | rgd = gfs2_rgrpd_get_next(rgd); | 1119 | rgd = gfs2_rgrpd_get_next(rgd); |
| 1204 | if (!rgd) | 1120 | if (!rgd) |
| 1205 | rgd = gfs2_rgrpd_get_first(sdp); | 1121 | rgd = gfs2_rgrpd_get_first(sdp); |
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7aeacbc65f35..63a8a902d9db 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
| @@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt) | |||
| 65 | gt->gt_quota_quantum = 60; | 65 | gt->gt_quota_quantum = 60; |
| 66 | gt->gt_atime_quantum = 3600; | 66 | gt->gt_atime_quantum = 3600; |
| 67 | gt->gt_new_files_jdata = 0; | 67 | gt->gt_new_files_jdata = 0; |
| 68 | gt->gt_new_files_directio = 0; | ||
| 69 | gt->gt_max_readahead = 1 << 18; | 68 | gt->gt_max_readahead = 1 << 18; |
| 70 | gt->gt_stall_secs = 600; | 69 | gt->gt_stall_secs = 600; |
| 71 | gt->gt_complain_secs = 10; | 70 | gt->gt_complain_secs = 10; |
| @@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, | |||
| 941 | } | 940 | } |
| 942 | 941 | ||
| 943 | error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, | 942 | error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, |
| 944 | LM_FLAG_PRIORITY | GL_NOCACHE, | 943 | GL_NOCACHE, t_gh); |
| 945 | t_gh); | ||
| 946 | 944 | ||
| 947 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | 945 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { |
| 948 | error = gfs2_jdesc_check(jd); | 946 | error = gfs2_jdesc_check(jd); |
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9ab9fc85ecd0..74846559fc3f 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
| @@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, | |||
| 110 | return len; | 110 | return len; |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | ||
| 114 | { | ||
| 115 | if (!capable(CAP_SYS_ADMIN)) | ||
| 116 | return -EACCES; | ||
| 117 | |||
| 118 | if (simple_strtol(buf, NULL, 0) != 1) | ||
| 119 | return -EINVAL; | ||
| 120 | |||
| 121 | gfs2_gl_hash_clear(sdp, NO_WAIT); | ||
| 122 | return len; | ||
| 123 | } | ||
| 124 | |||
| 125 | static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, | 113 | static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, |
| 126 | size_t len) | 114 | size_t len) |
| 127 | { | 115 | { |
| @@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) | |||
| 175 | GFS2_ATTR(id, 0444, id_show, NULL); | 163 | GFS2_ATTR(id, 0444, id_show, NULL); |
| 176 | GFS2_ATTR(fsname, 0444, fsname_show, NULL); | 164 | GFS2_ATTR(fsname, 0444, fsname_show, NULL); |
| 177 | GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); | 165 | GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); |
| 178 | GFS2_ATTR(shrink, 0200, NULL, shrink_store); | ||
| 179 | GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); | 166 | GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); |
| 180 | GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); | 167 | GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); |
| 181 | GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); | 168 | GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); |
| @@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = { | |||
| 186 | &gfs2_attr_id.attr, | 173 | &gfs2_attr_id.attr, |
| 187 | &gfs2_attr_fsname.attr, | 174 | &gfs2_attr_fsname.attr, |
| 188 | &gfs2_attr_freeze.attr, | 175 | &gfs2_attr_freeze.attr, |
| 189 | &gfs2_attr_shrink.attr, | ||
| 190 | &gfs2_attr_withdraw.attr, | 176 | &gfs2_attr_withdraw.attr, |
| 191 | &gfs2_attr_statfs_sync.attr, | 177 | &gfs2_attr_statfs_sync.attr, |
| 192 | &gfs2_attr_quota_sync.attr, | 178 | &gfs2_attr_quota_sync.attr, |
| @@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0); | |||
| 426 | TUNE_ATTR(complain_secs, 0); | 412 | TUNE_ATTR(complain_secs, 0); |
| 427 | TUNE_ATTR(statfs_slow, 0); | 413 | TUNE_ATTR(statfs_slow, 0); |
| 428 | TUNE_ATTR(new_files_jdata, 0); | 414 | TUNE_ATTR(new_files_jdata, 0); |
| 429 | TUNE_ATTR(new_files_directio, 0); | ||
| 430 | TUNE_ATTR(quota_simul_sync, 1); | 415 | TUNE_ATTR(quota_simul_sync, 1); |
| 431 | TUNE_ATTR(quota_cache_secs, 1); | 416 | TUNE_ATTR(quota_cache_secs, 1); |
| 432 | TUNE_ATTR(stall_secs, 1); | 417 | TUNE_ATTR(stall_secs, 1); |
| @@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = { | |||
| 455 | &tune_attr_quotad_secs.attr, | 440 | &tune_attr_quotad_secs.attr, |
| 456 | &tune_attr_quota_scale.attr, | 441 | &tune_attr_quota_scale.attr, |
| 457 | &tune_attr_new_files_jdata.attr, | 442 | &tune_attr_new_files_jdata.attr, |
| 458 | &tune_attr_new_files_directio.attr, | ||
| 459 | NULL, | 443 | NULL, |
| 460 | }; | 444 | }; |
| 461 | 445 | ||
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022ce..91389c8aee8a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
| @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
| 688 | 688 | ||
| 689 | J_ASSERT(transaction->t_state == T_FINISHED); | 689 | J_ASSERT(transaction->t_state == T_FINISHED); |
| 690 | J_ASSERT(transaction->t_buffers == NULL); | 690 | J_ASSERT(transaction->t_buffers == NULL); |
| 691 | J_ASSERT(transaction->t_sync_datalist == NULL); | ||
| 692 | J_ASSERT(transaction->t_forget == NULL); | 691 | J_ASSERT(transaction->t_forget == NULL); |
| 693 | J_ASSERT(transaction->t_iobuf_list == NULL); | 692 | J_ASSERT(transaction->t_iobuf_list == NULL); |
| 694 | J_ASSERT(transaction->t_shadow_list == NULL); | 693 | J_ASSERT(transaction->t_shadow_list == NULL); |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..f8b3be873226 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
| 23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
| 24 | #include <linux/crc32.h> | 24 | #include <linux/crc32.h> |
| 25 | #include <linux/writeback.h> | ||
| 26 | #include <linux/backing-dev.h> | ||
| 25 | 27 | ||
| 26 | /* | 28 | /* |
| 27 | * Default IO end handler for temporary BJ_IO buffer_heads. | 29 | * Default IO end handler for temporary BJ_IO buffer_heads. |
| @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |||
| 37 | } | 39 | } |
| 38 | 40 | ||
| 39 | /* | 41 | /* |
| 40 | * When an ext3-ordered file is truncated, it is possible that many pages are | 42 | * When an ext4 file is truncated, it is possible that some pages are not |
| 41 | * not sucessfully freed, because they are attached to a committing transaction. | 43 | * successfully freed, because they are attached to a committing transaction. |
| 42 | * After the transaction commits, these pages are left on the LRU, with no | 44 | * After the transaction commits, these pages are left on the LRU, with no |
| 43 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 45 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
| 44 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 46 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
| @@ -80,21 +82,6 @@ nope: | |||
| 80 | } | 82 | } |
| 81 | 83 | ||
| 82 | /* | 84 | /* |
| 83 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | ||
| 84 | * held. For ranking reasons we must trylock. If we lose, schedule away and | ||
| 85 | * return 0. j_list_lock is dropped in this case. | ||
| 86 | */ | ||
| 87 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | ||
| 88 | { | ||
| 89 | if (!jbd_trylock_bh_state(bh)) { | ||
| 90 | spin_unlock(&journal->j_list_lock); | ||
| 91 | schedule(); | ||
| 92 | return 0; | ||
| 93 | } | ||
| 94 | return 1; | ||
| 95 | } | ||
| 96 | |||
| 97 | /* | ||
| 98 | * Done it all: now submit the commit record. We should have | 85 | * Done it all: now submit the commit record. We should have |
| 99 | * cleaned up our previous buffers by now, so if we are in abort | 86 | * cleaned up our previous buffers by now, so if we are in abort |
| 100 | * mode we can now just skip the rest of the journal write | 87 | * mode we can now just skip the rest of the journal write |
| @@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
| 112 | struct buffer_head *bh; | 99 | struct buffer_head *bh; |
| 113 | int ret; | 100 | int ret; |
| 114 | int barrier_done = 0; | 101 | int barrier_done = 0; |
| 102 | struct timespec now = current_kernel_time(); | ||
| 115 | 103 | ||
| 116 | if (is_journal_aborted(journal)) | 104 | if (is_journal_aborted(journal)) |
| 117 | return 0; | 105 | return 0; |
| @@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal, | |||
| 126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 114 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
| 127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 115 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
| 128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 116 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
| 117 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | ||
| 118 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | ||
| 129 | 119 | ||
| 130 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 120 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
| 131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 121 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
| @@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) | |||
| 197 | } | 187 | } |
| 198 | 188 | ||
| 199 | /* | 189 | /* |
| 200 | * Wait for all submitted IO to complete. | 190 | * write the filemap data using writepage() address_space_operations. |
| 191 | * We don't do block allocation here even for delalloc. We don't | ||
| 192 | * use writepages() because with dealyed allocation we may be doing | ||
| 193 | * block allocation in writepages(). | ||
| 201 | */ | 194 | */ |
| 202 | static int journal_wait_on_locked_list(journal_t *journal, | 195 | static int journal_submit_inode_data_buffers(struct address_space *mapping) |
| 203 | transaction_t *commit_transaction) | ||
| 204 | { | 196 | { |
| 205 | int ret = 0; | 197 | int ret; |
| 206 | struct journal_head *jh; | 198 | struct writeback_control wbc = { |
| 207 | 199 | .sync_mode = WB_SYNC_ALL, | |
| 208 | while (commit_transaction->t_locked_list) { | 200 | .nr_to_write = mapping->nrpages * 2, |
| 209 | struct buffer_head *bh; | 201 | .range_start = 0, |
| 210 | 202 | .range_end = i_size_read(mapping->host), | |
| 211 | jh = commit_transaction->t_locked_list->b_tprev; | 203 | .for_writepages = 1, |
| 212 | bh = jh2bh(jh); | 204 | }; |
| 213 | get_bh(bh); | 205 | |
| 214 | if (buffer_locked(bh)) { | 206 | ret = generic_writepages(mapping, &wbc); |
| 215 | spin_unlock(&journal->j_list_lock); | ||
| 216 | wait_on_buffer(bh); | ||
| 217 | if (unlikely(!buffer_uptodate(bh))) | ||
| 218 | ret = -EIO; | ||
| 219 | spin_lock(&journal->j_list_lock); | ||
| 220 | } | ||
| 221 | if (!inverted_lock(journal, bh)) { | ||
| 222 | put_bh(bh); | ||
| 223 | spin_lock(&journal->j_list_lock); | ||
| 224 | continue; | ||
| 225 | } | ||
| 226 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
| 227 | __jbd2_journal_unfile_buffer(jh); | ||
| 228 | jbd_unlock_bh_state(bh); | ||
| 229 | jbd2_journal_remove_journal_head(bh); | ||
| 230 | put_bh(bh); | ||
| 231 | } else { | ||
| 232 | jbd_unlock_bh_state(bh); | ||
| 233 | } | ||
| 234 | put_bh(bh); | ||
| 235 | cond_resched_lock(&journal->j_list_lock); | ||
| 236 | } | ||
| 237 | return ret; | 207 | return ret; |
| 238 | } | 208 | } |
| 239 | 209 | ||
| 240 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 210 | /* |
| 211 | * Submit all the data buffers of inode associated with the transaction to | ||
| 212 | * disk. | ||
| 213 | * | ||
| 214 | * We are in a committing transaction. Therefore no new inode can be added to | ||
| 215 | * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently | ||
| 216 | * operate on from being released while we write out pages. | ||
| 217 | */ | ||
| 218 | static int journal_submit_data_buffers(journal_t *journal, | ||
| 219 | transaction_t *commit_transaction) | ||
| 241 | { | 220 | { |
| 242 | int i; | 221 | struct jbd2_inode *jinode; |
| 222 | int err, ret = 0; | ||
| 223 | struct address_space *mapping; | ||
| 243 | 224 | ||
| 244 | for (i = 0; i < bufs; i++) { | 225 | spin_lock(&journal->j_list_lock); |
| 245 | wbuf[i]->b_end_io = end_buffer_write_sync; | 226 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
| 246 | /* We use-up our safety reference in submit_bh() */ | 227 | mapping = jinode->i_vfs_inode->i_mapping; |
| 247 | submit_bh(WRITE, wbuf[i]); | 228 | jinode->i_flags |= JI_COMMIT_RUNNING; |
| 229 | spin_unlock(&journal->j_list_lock); | ||
| 230 | /* | ||
| 231 | * submit the inode data buffers. We use writepage | ||
| 232 | * instead of writepages. Because writepages can do | ||
| 233 | * block allocation with delalloc. We need to write | ||
| 234 | * only allocated blocks here. | ||
| 235 | */ | ||
| 236 | err = journal_submit_inode_data_buffers(mapping); | ||
| 237 | if (!ret) | ||
| 238 | ret = err; | ||
| 239 | spin_lock(&journal->j_list_lock); | ||
| 240 | J_ASSERT(jinode->i_transaction == commit_transaction); | ||
| 241 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | ||
| 242 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
| 248 | } | 243 | } |
| 244 | spin_unlock(&journal->j_list_lock); | ||
| 245 | return ret; | ||
| 249 | } | 246 | } |
| 250 | 247 | ||
| 251 | /* | 248 | /* |
| 252 | * Submit all the data buffers to disk | 249 | * Wait for data submitted for writeout, refile inodes to proper |
| 250 | * transaction if needed. | ||
| 251 | * | ||
| 253 | */ | 252 | */ |
| 254 | static void journal_submit_data_buffers(journal_t *journal, | 253 | static int journal_finish_inode_data_buffers(journal_t *journal, |
| 255 | transaction_t *commit_transaction) | 254 | transaction_t *commit_transaction) |
| 256 | { | 255 | { |
| 257 | struct journal_head *jh; | 256 | struct jbd2_inode *jinode, *next_i; |
| 258 | struct buffer_head *bh; | 257 | int err, ret = 0; |
| 259 | int locked; | ||
| 260 | int bufs = 0; | ||
| 261 | struct buffer_head **wbuf = journal->j_wbuf; | ||
| 262 | 258 | ||
| 263 | /* | 259 | /* For locking, see the comment in journal_submit_data_buffers() */ |
| 264 | * Whenever we unlock the journal and sleep, things can get added | ||
| 265 | * onto ->t_sync_datalist, so we have to keep looping back to | ||
| 266 | * write_out_data until we *know* that the list is empty. | ||
| 267 | * | ||
| 268 | * Cleanup any flushed data buffers from the data list. Even in | ||
| 269 | * abort mode, we want to flush this out as soon as possible. | ||
| 270 | */ | ||
| 271 | write_out_data: | ||
| 272 | cond_resched(); | ||
| 273 | spin_lock(&journal->j_list_lock); | 260 | spin_lock(&journal->j_list_lock); |
| 261 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | ||
| 262 | jinode->i_flags |= JI_COMMIT_RUNNING; | ||
| 263 | spin_unlock(&journal->j_list_lock); | ||
| 264 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | ||
| 265 | if (!ret) | ||
| 266 | ret = err; | ||
| 267 | spin_lock(&journal->j_list_lock); | ||
| 268 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | ||
| 269 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
| 270 | } | ||
| 274 | 271 | ||
| 275 | while (commit_transaction->t_sync_datalist) { | 272 | /* Now refile inode to proper lists */ |
| 276 | jh = commit_transaction->t_sync_datalist; | 273 | list_for_each_entry_safe(jinode, next_i, |
| 277 | bh = jh2bh(jh); | 274 | &commit_transaction->t_inode_list, i_list) { |
| 278 | locked = 0; | 275 | list_del(&jinode->i_list); |
| 279 | 276 | if (jinode->i_next_transaction) { | |
| 280 | /* Get reference just to make sure buffer does not disappear | 277 | jinode->i_transaction = jinode->i_next_transaction; |
| 281 | * when we are forced to drop various locks */ | 278 | jinode->i_next_transaction = NULL; |
| 282 | get_bh(bh); | 279 | list_add(&jinode->i_list, |
| 283 | /* If the buffer is dirty, we need to submit IO and hence | 280 | &jinode->i_transaction->t_inode_list); |
| 284 | * we need the buffer lock. We try to lock the buffer without | ||
| 285 | * blocking. If we fail, we need to drop j_list_lock and do | ||
| 286 | * blocking lock_buffer(). | ||
| 287 | */ | ||
| 288 | if (buffer_dirty(bh)) { | ||
| 289 | if (test_set_buffer_locked(bh)) { | ||
| 290 | BUFFER_TRACE(bh, "needs blocking lock"); | ||
| 291 | spin_unlock(&journal->j_list_lock); | ||
| 292 | /* Write out all data to prevent deadlocks */ | ||
| 293 | journal_do_submit_data(wbuf, bufs); | ||
| 294 | bufs = 0; | ||
| 295 | lock_buffer(bh); | ||
| 296 | spin_lock(&journal->j_list_lock); | ||
| 297 | } | ||
| 298 | locked = 1; | ||
| 299 | } | ||
| 300 | /* We have to get bh_state lock. Again out of order, sigh. */ | ||
| 301 | if (!inverted_lock(journal, bh)) { | ||
| 302 | jbd_lock_bh_state(bh); | ||
| 303 | spin_lock(&journal->j_list_lock); | ||
| 304 | } | ||
| 305 | /* Someone already cleaned up the buffer? */ | ||
| 306 | if (!buffer_jbd(bh) | ||
| 307 | || jh->b_transaction != commit_transaction | ||
| 308 | || jh->b_jlist != BJ_SyncData) { | ||
| 309 | jbd_unlock_bh_state(bh); | ||
| 310 | if (locked) | ||
| 311 | unlock_buffer(bh); | ||
| 312 | BUFFER_TRACE(bh, "already cleaned up"); | ||
| 313 | put_bh(bh); | ||
| 314 | continue; | ||
| 315 | } | ||
| 316 | if (locked && test_clear_buffer_dirty(bh)) { | ||
| 317 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | ||
| 318 | wbuf[bufs++] = bh; | ||
| 319 | __jbd2_journal_file_buffer(jh, commit_transaction, | ||
| 320 | BJ_Locked); | ||
| 321 | jbd_unlock_bh_state(bh); | ||
| 322 | if (bufs == journal->j_wbufsize) { | ||
| 323 | spin_unlock(&journal->j_list_lock); | ||
| 324 | journal_do_submit_data(wbuf, bufs); | ||
| 325 | bufs = 0; | ||
| 326 | goto write_out_data; | ||
| 327 | } | ||
| 328 | } else if (!locked && buffer_locked(bh)) { | ||
| 329 | __jbd2_journal_file_buffer(jh, commit_transaction, | ||
| 330 | BJ_Locked); | ||
| 331 | jbd_unlock_bh_state(bh); | ||
| 332 | put_bh(bh); | ||
| 333 | } else { | 281 | } else { |
| 334 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 282 | jinode->i_transaction = NULL; |
| 335 | __jbd2_journal_unfile_buffer(jh); | ||
| 336 | jbd_unlock_bh_state(bh); | ||
| 337 | if (locked) | ||
| 338 | unlock_buffer(bh); | ||
| 339 | jbd2_journal_remove_journal_head(bh); | ||
| 340 | /* Once for our safety reference, once for | ||
| 341 | * jbd2_journal_remove_journal_head() */ | ||
| 342 | put_bh(bh); | ||
| 343 | put_bh(bh); | ||
| 344 | } | ||
| 345 | |||
| 346 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | ||
| 347 | spin_unlock(&journal->j_list_lock); | ||
| 348 | goto write_out_data; | ||
| 349 | } | 283 | } |
| 350 | } | 284 | } |
| 351 | spin_unlock(&journal->j_list_lock); | 285 | spin_unlock(&journal->j_list_lock); |
| 352 | journal_do_submit_data(wbuf, bufs); | 286 | |
| 287 | return ret; | ||
| 353 | } | 288 | } |
| 354 | 289 | ||
| 355 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | 290 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
| @@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 524 | * Now start flushing things to disk, in the order they appear | 459 | * Now start flushing things to disk, in the order they appear |
| 525 | * on the transaction lists. Data blocks go first. | 460 | * on the transaction lists. Data blocks go first. |
| 526 | */ | 461 | */ |
| 527 | err = 0; | 462 | err = journal_submit_data_buffers(journal, commit_transaction); |
| 528 | journal_submit_data_buffers(journal, commit_transaction); | ||
| 529 | |||
| 530 | /* | ||
| 531 | * Wait for all previously submitted IO to complete if commit | ||
| 532 | * record is to be written synchronously. | ||
| 533 | */ | ||
| 534 | spin_lock(&journal->j_list_lock); | ||
| 535 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | ||
| 536 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) | ||
| 537 | err = journal_wait_on_locked_list(journal, | ||
| 538 | commit_transaction); | ||
| 539 | |||
| 540 | spin_unlock(&journal->j_list_lock); | ||
| 541 | |||
| 542 | if (err) | 463 | if (err) |
| 543 | jbd2_journal_abort(journal, err); | 464 | jbd2_journal_abort(journal, err); |
| 544 | 465 | ||
| @@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 547 | jbd_debug(3, "JBD: commit phase 2\n"); | 468 | jbd_debug(3, "JBD: commit phase 2\n"); |
| 548 | 469 | ||
| 549 | /* | 470 | /* |
| 550 | * If we found any dirty or locked buffers, then we should have | ||
| 551 | * looped back up to the write_out_data label. If there weren't | ||
| 552 | * any then journal_clean_data_list should have wiped the list | ||
| 553 | * clean by now, so check that it is in fact empty. | ||
| 554 | */ | ||
| 555 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | ||
| 556 | |||
| 557 | jbd_debug (3, "JBD: commit phase 3\n"); | ||
| 558 | |||
| 559 | /* | ||
| 560 | * Way to go: we have now written out all of the data for a | 471 | * Way to go: we have now written out all of the data for a |
| 561 | * transaction! Now comes the tricky part: we need to write out | 472 | * transaction! Now comes the tricky part: we need to write out |
| 562 | * metadata. Loop over the transaction's entire buffer list: | 473 | * metadata. Loop over the transaction's entire buffer list: |
| @@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
| 574 | J_ASSERT(commit_transaction->t_nr_buffers <= | 485 | J_ASSERT(commit_transaction->t_nr_buffers <= |
| 575 | commit_transaction->t_outstanding_credits); | 486 | commit_transaction->t_outstanding_credits); |
| 576 | 487 | ||
| 488 | err = 0; | ||
| 577 | descriptor = NULL; | 489 | descriptor = NULL; |
| 578 | bufs = 0; | 490 | bufs = 0; |
| 579 | while (commit_transaction->t_buffers) { | 491 | while (commit_transaction->t_buffers) { |
| @@ -748,15 +660,19 @@ start_journal_io: | |||
| 748 | &cbh, crc32_sum); | 660 | &cbh, crc32_sum); |
| 749 | if (err) | 661 | if (err) |
| 750 | __jbd2_journal_abort_hard(journal); | 662 | __jbd2_journal_abort_hard(journal); |
| 751 | |||
| 752 | spin_lock(&journal->j_list_lock); | ||
| 753 | err = journal_wait_on_locked_list(journal, | ||
| 754 | commit_transaction); | ||
| 755 | spin_unlock(&journal->j_list_lock); | ||
| 756 | if (err) | ||
| 757 | __jbd2_journal_abort_hard(journal); | ||
| 758 | } | 663 | } |
| 759 | 664 | ||
| 665 | /* | ||
| 666 | * This is the right place to wait for data buffers both for ASYNC | ||
| 667 | * and !ASYNC commit. If commit is ASYNC, we need to wait only after | ||
| 668 | * the commit block went to disk (which happens above). If commit is | ||
| 669 | * SYNC, we need to wait for data buffers before we start writing | ||
| 670 | * commit block, which happens below in such setting. | ||
| 671 | */ | ||
| 672 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | ||
| 673 | if (err) | ||
| 674 | jbd2_journal_abort(journal, err); | ||
| 675 | |||
| 760 | /* Lo and behold: we have just managed to send a transaction to | 676 | /* Lo and behold: we have just managed to send a transaction to |
| 761 | the log. Before we can commit it, wait for the IO so far to | 677 | the log. Before we can commit it, wait for the IO so far to |
| 762 | complete. Control buffers being written are on the | 678 | complete. Control buffers being written are on the |
| @@ -768,7 +684,7 @@ start_journal_io: | |||
| 768 | so we incur less scheduling load. | 684 | so we incur less scheduling load. |
| 769 | */ | 685 | */ |
| 770 | 686 | ||
| 771 | jbd_debug(3, "JBD: commit phase 4\n"); | 687 | jbd_debug(3, "JBD: commit phase 3\n"); |
| 772 | 688 | ||
| 773 | /* | 689 | /* |
| 774 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 690 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
| @@ -827,7 +743,7 @@ wait_for_iobuf: | |||
| 827 | 743 | ||
| 828 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 744 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
| 829 | 745 | ||
| 830 | jbd_debug(3, "JBD: commit phase 5\n"); | 746 | jbd_debug(3, "JBD: commit phase 4\n"); |
| 831 | 747 | ||
| 832 | /* Here we wait for the revoke record and descriptor record buffers */ | 748 | /* Here we wait for the revoke record and descriptor record buffers */ |
| 833 | wait_for_ctlbuf: | 749 | wait_for_ctlbuf: |
| @@ -854,7 +770,7 @@ wait_for_iobuf: | |||
| 854 | /* AKPM: bforget here */ | 770 | /* AKPM: bforget here */ |
| 855 | } | 771 | } |
| 856 | 772 | ||
| 857 | jbd_debug(3, "JBD: commit phase 6\n"); | 773 | jbd_debug(3, "JBD: commit phase 5\n"); |
| 858 | 774 | ||
| 859 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 775 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
| 860 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 776 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
| @@ -874,9 +790,9 @@ wait_for_iobuf: | |||
| 874 | transaction can be removed from any checkpoint list it was on | 790 | transaction can be removed from any checkpoint list it was on |
| 875 | before. */ | 791 | before. */ |
| 876 | 792 | ||
| 877 | jbd_debug(3, "JBD: commit phase 7\n"); | 793 | jbd_debug(3, "JBD: commit phase 6\n"); |
| 878 | 794 | ||
| 879 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | 795 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); |
| 880 | J_ASSERT(commit_transaction->t_buffers == NULL); | 796 | J_ASSERT(commit_transaction->t_buffers == NULL); |
| 881 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 797 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
| 882 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 798 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
| @@ -997,7 +913,7 @@ restart_loop: | |||
| 997 | 913 | ||
| 998 | /* Done with this transaction! */ | 914 | /* Done with this transaction! */ |
| 999 | 915 | ||
| 1000 | jbd_debug(3, "JBD: commit phase 8\n"); | 916 | jbd_debug(3, "JBD: commit phase 7\n"); |
| 1001 | 917 | ||
| 1002 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 918 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
| 1003 | 919 | ||
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a79..b26c6d9fe6ae 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
| @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); | |||
| 50 | EXPORT_SYMBOL(jbd2_journal_get_write_access); | 50 | EXPORT_SYMBOL(jbd2_journal_get_write_access); |
| 51 | EXPORT_SYMBOL(jbd2_journal_get_create_access); | 51 | EXPORT_SYMBOL(jbd2_journal_get_create_access); |
| 52 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); | 52 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); |
| 53 | EXPORT_SYMBOL(jbd2_journal_dirty_data); | ||
| 54 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); | 53 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); |
| 55 | EXPORT_SYMBOL(jbd2_journal_release_buffer); | 54 | EXPORT_SYMBOL(jbd2_journal_release_buffer); |
| 56 | EXPORT_SYMBOL(jbd2_journal_forget); | 55 | EXPORT_SYMBOL(jbd2_journal_forget); |
| @@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); | |||
| 82 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); | 81 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); |
| 83 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); | 82 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); |
| 84 | EXPORT_SYMBOL(jbd2_journal_force_commit); | 83 | EXPORT_SYMBOL(jbd2_journal_force_commit); |
| 84 | EXPORT_SYMBOL(jbd2_journal_file_inode); | ||
| 85 | EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); | ||
| 86 | EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); | ||
| 87 | EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); | ||
| 85 | 88 | ||
| 86 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | 89 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); |
| 87 | static void __journal_abort_soft (journal_t *journal, int errno); | 90 | static void __journal_abort_soft (journal_t *journal, int errno); |
| @@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) | |||
| 2195 | } | 2198 | } |
| 2196 | 2199 | ||
| 2197 | /* | 2200 | /* |
| 2201 | * Initialize jbd inode head | ||
| 2202 | */ | ||
| 2203 | void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) | ||
| 2204 | { | ||
| 2205 | jinode->i_transaction = NULL; | ||
| 2206 | jinode->i_next_transaction = NULL; | ||
| 2207 | jinode->i_vfs_inode = inode; | ||
| 2208 | jinode->i_flags = 0; | ||
| 2209 | INIT_LIST_HEAD(&jinode->i_list); | ||
| 2210 | } | ||
| 2211 | |||
| 2212 | /* | ||
| 2213 | * Function to be called before we start removing inode from memory (i.e., | ||
| 2214 | * clear_inode() is a fine place to be called from). It removes inode from | ||
| 2215 | * transaction's lists. | ||
| 2216 | */ | ||
| 2217 | void jbd2_journal_release_jbd_inode(journal_t *journal, | ||
| 2218 | struct jbd2_inode *jinode) | ||
| 2219 | { | ||
| 2220 | int writeout = 0; | ||
| 2221 | |||
| 2222 | if (!journal) | ||
| 2223 | return; | ||
| 2224 | restart: | ||
| 2225 | spin_lock(&journal->j_list_lock); | ||
| 2226 | /* Is commit writing out inode - we have to wait */ | ||
| 2227 | if (jinode->i_flags & JI_COMMIT_RUNNING) { | ||
| 2228 | wait_queue_head_t *wq; | ||
| 2229 | DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); | ||
| 2230 | wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
| 2231 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
| 2232 | spin_unlock(&journal->j_list_lock); | ||
| 2233 | schedule(); | ||
| 2234 | finish_wait(wq, &wait.wait); | ||
| 2235 | goto restart; | ||
| 2236 | } | ||
| 2237 | |||
| 2238 | /* Do we need to wait for data writeback? */ | ||
| 2239 | if (journal->j_committing_transaction == jinode->i_transaction) | ||
| 2240 | writeout = 1; | ||
| 2241 | if (jinode->i_transaction) { | ||
| 2242 | list_del(&jinode->i_list); | ||
| 2243 | jinode->i_transaction = NULL; | ||
| 2244 | } | ||
| 2245 | spin_unlock(&journal->j_list_lock); | ||
| 2246 | } | ||
| 2247 | |||
| 2248 | /* | ||
| 2198 | * debugfs tunables | 2249 | * debugfs tunables |
| 2199 | */ | 2250 | */ |
| 2200 | #ifdef CONFIG_JBD2_DEBUG | 2251 | #ifdef CONFIG_JBD2_DEBUG |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6e006e67804..4f7cadbb19fa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
| @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | |||
| 41 | * new transaction and we can't block without protecting against other | 41 | * new transaction and we can't block without protecting against other |
| 42 | * processes trying to touch the journal while it is in transition. | 42 | * processes trying to touch the journal while it is in transition. |
| 43 | * | 43 | * |
| 44 | * Called under j_state_lock | ||
| 45 | */ | 44 | */ |
| 46 | 45 | ||
| 47 | static transaction_t * | 46 | static transaction_t * |
| @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
| 52 | transaction->t_tid = journal->j_transaction_sequence++; | 51 | transaction->t_tid = journal->j_transaction_sequence++; |
| 53 | transaction->t_expires = jiffies + journal->j_commit_interval; | 52 | transaction->t_expires = jiffies + journal->j_commit_interval; |
| 54 | spin_lock_init(&transaction->t_handle_lock); | 53 | spin_lock_init(&transaction->t_handle_lock); |
| 54 | INIT_LIST_HEAD(&transaction->t_inode_list); | ||
| 55 | 55 | ||
| 56 | /* Set up the commit timer for the new transaction. */ | 56 | /* Set up the commit timer for the new transaction. */ |
| 57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | 57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); |
| @@ -943,183 +943,6 @@ out: | |||
| 943 | } | 943 | } |
| 944 | 944 | ||
| 945 | /** | 945 | /** |
| 946 | * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which | ||
| 947 | * needs to be flushed before we can commit the | ||
| 948 | * current transaction. | ||
| 949 | * @handle: transaction | ||
| 950 | * @bh: bufferhead to mark | ||
| 951 | * | ||
| 952 | * The buffer is placed on the transaction's data list and is marked as | ||
| 953 | * belonging to the transaction. | ||
| 954 | * | ||
| 955 | * Returns error number or 0 on success. | ||
| 956 | * | ||
| 957 | * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage | ||
| 958 | * by kswapd. | ||
| 959 | */ | ||
| 960 | int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
| 961 | { | ||
| 962 | journal_t *journal = handle->h_transaction->t_journal; | ||
| 963 | int need_brelse = 0; | ||
| 964 | struct journal_head *jh; | ||
| 965 | |||
| 966 | if (is_handle_aborted(handle)) | ||
| 967 | return 0; | ||
| 968 | |||
| 969 | jh = jbd2_journal_add_journal_head(bh); | ||
| 970 | JBUFFER_TRACE(jh, "entry"); | ||
| 971 | |||
| 972 | /* | ||
| 973 | * The buffer could *already* be dirty. Writeout can start | ||
| 974 | * at any time. | ||
| 975 | */ | ||
| 976 | jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | ||
| 977 | |||
| 978 | /* | ||
| 979 | * What if the buffer is already part of a running transaction? | ||
| 980 | * | ||
| 981 | * There are two cases: | ||
| 982 | * 1) It is part of the current running transaction. Refile it, | ||
| 983 | * just in case we have allocated it as metadata, deallocated | ||
| 984 | * it, then reallocated it as data. | ||
| 985 | * 2) It is part of the previous, still-committing transaction. | ||
| 986 | * If all we want to do is to guarantee that the buffer will be | ||
| 987 | * written to disk before this new transaction commits, then | ||
| 988 | * being sure that the *previous* transaction has this same | ||
| 989 | * property is sufficient for us! Just leave it on its old | ||
| 990 | * transaction. | ||
| 991 | * | ||
| 992 | * In case (2), the buffer must not already exist as metadata | ||
| 993 | * --- that would violate write ordering (a transaction is free | ||
| 994 | * to write its data at any point, even before the previous | ||
| 995 | * committing transaction has committed). The caller must | ||
| 996 | * never, ever allow this to happen: there's nothing we can do | ||
| 997 | * about it in this layer. | ||
| 998 | */ | ||
| 999 | jbd_lock_bh_state(bh); | ||
| 1000 | spin_lock(&journal->j_list_lock); | ||
| 1001 | |||
| 1002 | /* Now that we have bh_state locked, are we really still mapped? */ | ||
| 1003 | if (!buffer_mapped(bh)) { | ||
| 1004 | JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); | ||
| 1005 | goto no_journal; | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | if (jh->b_transaction) { | ||
| 1009 | JBUFFER_TRACE(jh, "has transaction"); | ||
| 1010 | if (jh->b_transaction != handle->h_transaction) { | ||
| 1011 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
| 1012 | J_ASSERT_JH(jh, jh->b_transaction == | ||
| 1013 | journal->j_committing_transaction); | ||
| 1014 | |||
| 1015 | /* @@@ IS THIS TRUE ? */ | ||
| 1016 | /* | ||
| 1017 | * Not any more. Scenario: someone does a write() | ||
| 1018 | * in data=journal mode. The buffer's transaction has | ||
| 1019 | * moved into commit. Then someone does another | ||
| 1020 | * write() to the file. We do the frozen data copyout | ||
| 1021 | * and set b_next_transaction to point to j_running_t. | ||
| 1022 | * And while we're in that state, someone does a | ||
| 1023 | * writepage() in an attempt to pageout the same area | ||
| 1024 | * of the file via a shared mapping. At present that | ||
| 1025 | * calls jbd2_journal_dirty_data(), and we get right here. | ||
| 1026 | * It may be too late to journal the data. Simply | ||
| 1027 | * falling through to the next test will suffice: the | ||
| 1028 | * data will be dirty and wil be checkpointed. The | ||
| 1029 | * ordering comments in the next comment block still | ||
| 1030 | * apply. | ||
| 1031 | */ | ||
| 1032 | //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
| 1033 | |||
| 1034 | /* | ||
| 1035 | * If we're journalling data, and this buffer was | ||
| 1036 | * subject to a write(), it could be metadata, forget | ||
| 1037 | * or shadow against the committing transaction. Now, | ||
| 1038 | * someone has dirtied the same darn page via a mapping | ||
| 1039 | * and it is being writepage()'d. | ||
| 1040 | * We *could* just steal the page from commit, with some | ||
| 1041 | * fancy locking there. Instead, we just skip it - | ||
| 1042 | * don't tie the page's buffers to the new transaction | ||
| 1043 | * at all. | ||
| 1044 | * Implication: if we crash before the writepage() data | ||
| 1045 | * is written into the filesystem, recovery will replay | ||
| 1046 | * the write() data. | ||
| 1047 | */ | ||
| 1048 | if (jh->b_jlist != BJ_None && | ||
| 1049 | jh->b_jlist != BJ_SyncData && | ||
| 1050 | jh->b_jlist != BJ_Locked) { | ||
| 1051 | JBUFFER_TRACE(jh, "Not stealing"); | ||
| 1052 | goto no_journal; | ||
| 1053 | } | ||
| 1054 | |||
| 1055 | /* | ||
| 1056 | * This buffer may be undergoing writeout in commit. We | ||
| 1057 | * can't return from here and let the caller dirty it | ||
| 1058 | * again because that can cause the write-out loop in | ||
| 1059 | * commit to never terminate. | ||
| 1060 | */ | ||
| 1061 | if (buffer_dirty(bh)) { | ||
| 1062 | get_bh(bh); | ||
| 1063 | spin_unlock(&journal->j_list_lock); | ||
| 1064 | jbd_unlock_bh_state(bh); | ||
| 1065 | need_brelse = 1; | ||
| 1066 | sync_dirty_buffer(bh); | ||
| 1067 | jbd_lock_bh_state(bh); | ||
| 1068 | spin_lock(&journal->j_list_lock); | ||
| 1069 | /* Since we dropped the lock... */ | ||
| 1070 | if (!buffer_mapped(bh)) { | ||
| 1071 | JBUFFER_TRACE(jh, "buffer got unmapped"); | ||
| 1072 | goto no_journal; | ||
| 1073 | } | ||
| 1074 | /* The buffer may become locked again at any | ||
| 1075 | time if it is redirtied */ | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | /* journal_clean_data_list() may have got there first */ | ||
| 1079 | if (jh->b_transaction != NULL) { | ||
| 1080 | JBUFFER_TRACE(jh, "unfile from commit"); | ||
| 1081 | __jbd2_journal_temp_unlink_buffer(jh); | ||
| 1082 | /* It still points to the committing | ||
| 1083 | * transaction; move it to this one so | ||
| 1084 | * that the refile assert checks are | ||
| 1085 | * happy. */ | ||
| 1086 | jh->b_transaction = handle->h_transaction; | ||
| 1087 | } | ||
| 1088 | /* The buffer will be refiled below */ | ||
| 1089 | |||
| 1090 | } | ||
| 1091 | /* | ||
| 1092 | * Special case --- the buffer might actually have been | ||
| 1093 | * allocated and then immediately deallocated in the previous, | ||
| 1094 | * committing transaction, so might still be left on that | ||
| 1095 | * transaction's metadata lists. | ||
| 1096 | */ | ||
| 1097 | if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | ||
| 1098 | JBUFFER_TRACE(jh, "not on correct data list: unfile"); | ||
| 1099 | J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | ||
| 1100 | __jbd2_journal_temp_unlink_buffer(jh); | ||
| 1101 | jh->b_transaction = handle->h_transaction; | ||
| 1102 | JBUFFER_TRACE(jh, "file as data"); | ||
| 1103 | __jbd2_journal_file_buffer(jh, handle->h_transaction, | ||
| 1104 | BJ_SyncData); | ||
| 1105 | } | ||
| 1106 | } else { | ||
| 1107 | JBUFFER_TRACE(jh, "not on a transaction"); | ||
| 1108 | __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | ||
| 1109 | } | ||
| 1110 | no_journal: | ||
| 1111 | spin_unlock(&journal->j_list_lock); | ||
| 1112 | jbd_unlock_bh_state(bh); | ||
| 1113 | if (need_brelse) { | ||
| 1114 | BUFFER_TRACE(bh, "brelse"); | ||
| 1115 | __brelse(bh); | ||
| 1116 | } | ||
| 1117 | JBUFFER_TRACE(jh, "exit"); | ||
| 1118 | jbd2_journal_put_journal_head(jh); | ||
| 1119 | return 0; | ||
| 1120 | } | ||
| 1121 | |||
| 1122 | /** | ||
| 1123 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata | 946 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata |
| 1124 | * @handle: transaction to add buffer to. | 947 | * @handle: transaction to add buffer to. |
| 1125 | * @bh: buffer to mark | 948 | * @bh: buffer to mark |
| @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |||
| 1541 | * Remove a buffer from the appropriate transaction list. | 1364 | * Remove a buffer from the appropriate transaction list. |
| 1542 | * | 1365 | * |
| 1543 | * Note that this function can *change* the value of | 1366 | * Note that this function can *change* the value of |
| 1544 | * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | 1367 | * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, |
| 1545 | * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | 1368 | * t_log_list or t_reserved_list. If the caller is holding onto a copy of one |
| 1546 | * is holding onto a copy of one of thee pointers, it could go bad. | 1369 | * of these pointers, it could go bad. Generally the caller needs to re-read |
| 1547 | * Generally the caller needs to re-read the pointer from the transaction_t. | 1370 | * the pointer from the transaction_t. |
| 1548 | * | 1371 | * |
| 1549 | * Called under j_list_lock. The journal may not be locked. | 1372 | * Called under j_list_lock. The journal may not be locked. |
| 1550 | */ | 1373 | */ |
| @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
| 1566 | switch (jh->b_jlist) { | 1389 | switch (jh->b_jlist) { |
| 1567 | case BJ_None: | 1390 | case BJ_None: |
| 1568 | return; | 1391 | return; |
| 1569 | case BJ_SyncData: | ||
| 1570 | list = &transaction->t_sync_datalist; | ||
| 1571 | break; | ||
| 1572 | case BJ_Metadata: | 1392 | case BJ_Metadata: |
| 1573 | transaction->t_nr_buffers--; | 1393 | transaction->t_nr_buffers--; |
| 1574 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | 1394 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); |
| @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
| 1589 | case BJ_Reserved: | 1409 | case BJ_Reserved: |
| 1590 | list = &transaction->t_reserved_list; | 1410 | list = &transaction->t_reserved_list; |
| 1591 | break; | 1411 | break; |
| 1592 | case BJ_Locked: | ||
| 1593 | list = &transaction->t_locked_list; | ||
| 1594 | break; | ||
| 1595 | } | 1412 | } |
| 1596 | 1413 | ||
| 1597 | __blist_del_buffer(list, jh); | 1414 | __blist_del_buffer(list, jh); |
| @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |||
| 1634 | goto out; | 1451 | goto out; |
| 1635 | 1452 | ||
| 1636 | spin_lock(&journal->j_list_lock); | 1453 | spin_lock(&journal->j_list_lock); |
| 1637 | if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { | 1454 | if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { |
| 1638 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | ||
| 1639 | /* A written-back ordered data buffer */ | ||
| 1640 | JBUFFER_TRACE(jh, "release data"); | ||
| 1641 | __jbd2_journal_unfile_buffer(jh); | ||
| 1642 | jbd2_journal_remove_journal_head(bh); | ||
| 1643 | __brelse(bh); | ||
| 1644 | } | ||
| 1645 | } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | ||
| 1646 | /* written-back checkpointed metadata buffer */ | 1455 | /* written-back checkpointed metadata buffer */ |
| 1647 | if (jh->b_jlist == BJ_None) { | 1456 | if (jh->b_jlist == BJ_None) { |
| 1648 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 1457 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
| @@ -1656,12 +1465,43 @@ out: | |||
| 1656 | return; | 1465 | return; |
| 1657 | } | 1466 | } |
| 1658 | 1467 | ||
| 1468 | /* | ||
| 1469 | * jbd2_journal_try_to_free_buffers() could race with | ||
| 1470 | * jbd2_journal_commit_transaction(). The later might still hold the | ||
| 1471 | * reference count to the buffers when inspecting them on | ||
| 1472 | * t_syncdata_list or t_locked_list. | ||
| 1473 | * | ||
| 1474 | * jbd2_journal_try_to_free_buffers() will call this function to | ||
| 1475 | * wait for the current transaction to finish syncing data buffers, before | ||
| 1476 | * try to free that buffer. | ||
| 1477 | * | ||
| 1478 | * Called with journal->j_state_lock hold. | ||
| 1479 | */ | ||
| 1480 | static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) | ||
| 1481 | { | ||
| 1482 | transaction_t *transaction; | ||
| 1483 | tid_t tid; | ||
| 1484 | |||
| 1485 | spin_lock(&journal->j_state_lock); | ||
| 1486 | transaction = journal->j_committing_transaction; | ||
| 1487 | |||
| 1488 | if (!transaction) { | ||
| 1489 | spin_unlock(&journal->j_state_lock); | ||
| 1490 | return; | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | tid = transaction->t_tid; | ||
| 1494 | spin_unlock(&journal->j_state_lock); | ||
| 1495 | jbd2_log_wait_commit(journal, tid); | ||
| 1496 | } | ||
| 1659 | 1497 | ||
| 1660 | /** | 1498 | /** |
| 1661 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. | 1499 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. |
| 1662 | * @journal: journal for operation | 1500 | * @journal: journal for operation |
| 1663 | * @page: to try and free | 1501 | * @page: to try and free |
| 1664 | * @unused_gfp_mask: unused | 1502 | * @gfp_mask: we use the mask to detect how hard should we try to release |
| 1503 | * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to | ||
| 1504 | * release the buffers. | ||
| 1665 | * | 1505 | * |
| 1666 | * | 1506 | * |
| 1667 | * For all the buffers on this page, | 1507 | * For all the buffers on this page, |
| @@ -1690,9 +1530,11 @@ out: | |||
| 1690 | * journal_try_to_free_buffer() is changing its state. But that | 1530 | * journal_try_to_free_buffer() is changing its state. But that |
| 1691 | * cannot happen because we never reallocate freed data as metadata | 1531 | * cannot happen because we never reallocate freed data as metadata |
| 1692 | * while the data is part of a transaction. Yes? | 1532 | * while the data is part of a transaction. Yes? |
| 1533 | * | ||
| 1534 | * Return 0 on failure, 1 on success | ||
| 1693 | */ | 1535 | */ |
| 1694 | int jbd2_journal_try_to_free_buffers(journal_t *journal, | 1536 | int jbd2_journal_try_to_free_buffers(journal_t *journal, |
| 1695 | struct page *page, gfp_t unused_gfp_mask) | 1537 | struct page *page, gfp_t gfp_mask) |
| 1696 | { | 1538 | { |
| 1697 | struct buffer_head *head; | 1539 | struct buffer_head *head; |
| 1698 | struct buffer_head *bh; | 1540 | struct buffer_head *bh; |
| @@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |||
| 1708 | /* | 1550 | /* |
| 1709 | * We take our own ref against the journal_head here to avoid | 1551 | * We take our own ref against the journal_head here to avoid |
| 1710 | * having to add tons of locking around each instance of | 1552 | * having to add tons of locking around each instance of |
| 1711 | * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). | 1553 | * jbd2_journal_remove_journal_head() and |
| 1554 | * jbd2_journal_put_journal_head(). | ||
| 1712 | */ | 1555 | */ |
| 1713 | jh = jbd2_journal_grab_journal_head(bh); | 1556 | jh = jbd2_journal_grab_journal_head(bh); |
| 1714 | if (!jh) | 1557 | if (!jh) |
| @@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |||
| 1721 | if (buffer_jbd(bh)) | 1564 | if (buffer_jbd(bh)) |
| 1722 | goto busy; | 1565 | goto busy; |
| 1723 | } while ((bh = bh->b_this_page) != head); | 1566 | } while ((bh = bh->b_this_page) != head); |
| 1567 | |||
| 1724 | ret = try_to_free_buffers(page); | 1568 | ret = try_to_free_buffers(page); |
| 1569 | |||
| 1570 | /* | ||
| 1571 | * There are a number of places where jbd2_journal_try_to_free_buffers() | ||
| 1572 | * could race with jbd2_journal_commit_transaction(), the later still | ||
| 1573 | * holds the reference to the buffers to free while processing them. | ||
| 1574 | * try_to_free_buffers() failed to free those buffers. Some of the | ||
| 1575 | * caller of releasepage() request page buffers to be dropped, otherwise | ||
| 1576 | * treat the fail-to-free as errors (such as generic_file_direct_IO()) | ||
| 1577 | * | ||
| 1578 | * So, if the caller of try_to_release_page() wants the synchronous | ||
| 1579 | * behaviour(i.e make sure buffers are dropped upon return), | ||
| 1580 | * let's wait for the current transaction to finish flush of | ||
| 1581 | * dirty data buffers, then try to free those buffers again, | ||
| 1582 | * with the journal locked. | ||
| 1583 | */ | ||
| 1584 | if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { | ||
| 1585 | jbd2_journal_wait_for_transaction_sync_data(journal); | ||
| 1586 | ret = try_to_free_buffers(page); | ||
| 1587 | } | ||
| 1588 | |||
| 1725 | busy: | 1589 | busy: |
| 1726 | return ret; | 1590 | return ret; |
| 1727 | } | 1591 | } |
| @@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |||
| 1823 | if (!buffer_jbd(bh)) | 1687 | if (!buffer_jbd(bh)) |
| 1824 | goto zap_buffer_unlocked; | 1688 | goto zap_buffer_unlocked; |
| 1825 | 1689 | ||
| 1690 | /* OK, we have data buffer in journaled mode */ | ||
| 1826 | spin_lock(&journal->j_state_lock); | 1691 | spin_lock(&journal->j_state_lock); |
| 1827 | jbd_lock_bh_state(bh); | 1692 | jbd_lock_bh_state(bh); |
| 1828 | spin_lock(&journal->j_list_lock); | 1693 | spin_lock(&journal->j_list_lock); |
| @@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |||
| 1886 | } | 1751 | } |
| 1887 | } else if (transaction == journal->j_committing_transaction) { | 1752 | } else if (transaction == journal->j_committing_transaction) { |
| 1888 | JBUFFER_TRACE(jh, "on committing transaction"); | 1753 | JBUFFER_TRACE(jh, "on committing transaction"); |
| 1889 | if (jh->b_jlist == BJ_Locked) { | ||
| 1890 | /* | ||
| 1891 | * The buffer is on the committing transaction's locked | ||
| 1892 | * list. We have the buffer locked, so I/O has | ||
| 1893 | * completed. So we can nail the buffer now. | ||
| 1894 | */ | ||
| 1895 | may_free = __dispose_buffer(jh, transaction); | ||
| 1896 | goto zap_buffer; | ||
| 1897 | } | ||
| 1898 | /* | 1754 | /* |
| 1899 | * If it is committing, we simply cannot touch it. We | 1755 | * If it is committing, we simply cannot touch it. We |
| 1900 | * can remove it's next_transaction pointer from the | 1756 | * can remove it's next_transaction pointer from the |
| @@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
| 2027 | J_ASSERT_JH(jh, !jh->b_committed_data); | 1883 | J_ASSERT_JH(jh, !jh->b_committed_data); |
| 2028 | J_ASSERT_JH(jh, !jh->b_frozen_data); | 1884 | J_ASSERT_JH(jh, !jh->b_frozen_data); |
| 2029 | return; | 1885 | return; |
| 2030 | case BJ_SyncData: | ||
| 2031 | list = &transaction->t_sync_datalist; | ||
| 2032 | break; | ||
| 2033 | case BJ_Metadata: | 1886 | case BJ_Metadata: |
| 2034 | transaction->t_nr_buffers++; | 1887 | transaction->t_nr_buffers++; |
| 2035 | list = &transaction->t_buffers; | 1888 | list = &transaction->t_buffers; |
| @@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
| 2049 | case BJ_Reserved: | 1902 | case BJ_Reserved: |
| 2050 | list = &transaction->t_reserved_list; | 1903 | list = &transaction->t_reserved_list; |
| 2051 | break; | 1904 | break; |
| 2052 | case BJ_Locked: | ||
| 2053 | list = &transaction->t_locked_list; | ||
| 2054 | break; | ||
| 2055 | } | 1905 | } |
| 2056 | 1906 | ||
| 2057 | __blist_add_buffer(list, jh); | 1907 | __blist_add_buffer(list, jh); |
| @@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) | |||
| 2141 | spin_unlock(&journal->j_list_lock); | 1991 | spin_unlock(&journal->j_list_lock); |
| 2142 | __brelse(bh); | 1992 | __brelse(bh); |
| 2143 | } | 1993 | } |
| 1994 | |||
| 1995 | /* | ||
| 1996 | * File inode in the inode list of the handle's transaction | ||
| 1997 | */ | ||
| 1998 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) | ||
| 1999 | { | ||
| 2000 | transaction_t *transaction = handle->h_transaction; | ||
| 2001 | journal_t *journal = transaction->t_journal; | ||
| 2002 | |||
| 2003 | if (is_handle_aborted(handle)) | ||
| 2004 | return -EIO; | ||
| 2005 | |||
| 2006 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, | ||
| 2007 | transaction->t_tid); | ||
| 2008 | |||
| 2009 | /* | ||
| 2010 | * First check whether inode isn't already on the transaction's | ||
| 2011 | * lists without taking the lock. Note that this check is safe | ||
| 2012 | * without the lock as we cannot race with somebody removing inode | ||
| 2013 | * from the transaction. The reason is that we remove inode from the | ||
| 2014 | * transaction only in journal_release_jbd_inode() and when we commit | ||
| 2015 | * the transaction. We are guarded from the first case by holding | ||
| 2016 | * a reference to the inode. We are safe against the second case | ||
| 2017 | * because if jinode->i_transaction == transaction, commit code | ||
| 2018 | * cannot touch the transaction because we hold reference to it, | ||
| 2019 | * and if jinode->i_next_transaction == transaction, commit code | ||
| 2020 | * will only file the inode where we want it. | ||
| 2021 | */ | ||
| 2022 | if (jinode->i_transaction == transaction || | ||
| 2023 | jinode->i_next_transaction == transaction) | ||
| 2024 | return 0; | ||
| 2025 | |||
| 2026 | spin_lock(&journal->j_list_lock); | ||
| 2027 | |||
| 2028 | if (jinode->i_transaction == transaction || | ||
| 2029 | jinode->i_next_transaction == transaction) | ||
| 2030 | goto done; | ||
| 2031 | |||
| 2032 | /* On some different transaction's list - should be | ||
| 2033 | * the committing one */ | ||
| 2034 | if (jinode->i_transaction) { | ||
| 2035 | J_ASSERT(jinode->i_next_transaction == NULL); | ||
| 2036 | J_ASSERT(jinode->i_transaction == | ||
| 2037 | journal->j_committing_transaction); | ||
| 2038 | jinode->i_next_transaction = transaction; | ||
| 2039 | goto done; | ||
| 2040 | } | ||
| 2041 | /* Not on any transaction list... */ | ||
| 2042 | J_ASSERT(!jinode->i_next_transaction); | ||
| 2043 | jinode->i_transaction = transaction; | ||
| 2044 | list_add(&jinode->i_list, &transaction->t_inode_list); | ||
| 2045 | done: | ||
| 2046 | spin_unlock(&journal->j_list_lock); | ||
| 2047 | |||
| 2048 | return 0; | ||
| 2049 | } | ||
| 2050 | |||
| 2051 | /* | ||
| 2052 | * This function must be called when inode is journaled in ordered mode | ||
| 2053 | * before truncation happens. It starts writeout of truncated part in | ||
| 2054 | * case it is in the committing transaction so that we stand to ordered | ||
| 2055 | * mode consistency guarantees. | ||
| 2056 | */ | ||
| 2057 | int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, | ||
| 2058 | loff_t new_size) | ||
| 2059 | { | ||
| 2060 | journal_t *journal; | ||
| 2061 | transaction_t *commit_trans; | ||
| 2062 | int ret = 0; | ||
| 2063 | |||
| 2064 | if (!inode->i_transaction && !inode->i_next_transaction) | ||
| 2065 | goto out; | ||
| 2066 | journal = inode->i_transaction->t_journal; | ||
| 2067 | spin_lock(&journal->j_state_lock); | ||
| 2068 | commit_trans = journal->j_committing_transaction; | ||
| 2069 | spin_unlock(&journal->j_state_lock); | ||
| 2070 | if (inode->i_transaction == commit_trans) { | ||
| 2071 | ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, | ||
| 2072 | new_size, LLONG_MAX); | ||
| 2073 | if (ret) | ||
| 2074 | jbd2_journal_abort(journal, ret); | ||
| 2075 | } | ||
| 2076 | out: | ||
| 2077 | return ret; | ||
| 2078 | } | ||
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index bf6ab19b86ee..6a73de84bcef 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/ctype.h> | 21 | #include <linux/ctype.h> |
| 22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 23 | #include <linux/proc_fs.h> | 23 | #include <linux/proc_fs.h> |
| 24 | #include <linux/seq_file.h> | ||
| 24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
| 25 | #include "jfs_incore.h" | 26 | #include "jfs_incore.h" |
| 26 | #include "jfs_filsys.h" | 27 | #include "jfs_filsys.h" |
| @@ -30,29 +31,19 @@ | |||
| 30 | 31 | ||
| 31 | static struct proc_dir_entry *base; | 32 | static struct proc_dir_entry *base; |
| 32 | #ifdef CONFIG_JFS_DEBUG | 33 | #ifdef CONFIG_JFS_DEBUG |
| 33 | static int loglevel_read(char *page, char **start, off_t off, | 34 | static int jfs_loglevel_proc_show(struct seq_file *m, void *v) |
| 34 | int count, int *eof, void *data) | ||
| 35 | { | 35 | { |
| 36 | int len; | 36 | seq_printf(m, "%d\n", jfsloglevel); |
| 37 | 37 | return 0; | |
| 38 | len = sprintf(page, "%d\n", jfsloglevel); | 38 | } |
| 39 | |||
| 40 | len -= off; | ||
| 41 | *start = page + off; | ||
| 42 | |||
| 43 | if (len > count) | ||
| 44 | len = count; | ||
| 45 | else | ||
| 46 | *eof = 1; | ||
| 47 | |||
| 48 | if (len < 0) | ||
| 49 | len = 0; | ||
| 50 | 39 | ||
| 51 | return len; | 40 | static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) |
| 41 | { | ||
| 42 | return single_open(file, jfs_loglevel_proc_show, NULL); | ||
| 52 | } | 43 | } |
| 53 | 44 | ||
| 54 | static int loglevel_write(struct file *file, const char __user *buffer, | 45 | static ssize_t jfs_loglevel_proc_write(struct file *file, |
| 55 | unsigned long count, void *data) | 46 | const char __user *buffer, size_t count, loff_t *ppos) |
| 56 | { | 47 | { |
| 57 | char c; | 48 | char c; |
| 58 | 49 | ||
| @@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer, | |||
| 65 | jfsloglevel = c - '0'; | 56 | jfsloglevel = c - '0'; |
| 66 | return count; | 57 | return count; |
| 67 | } | 58 | } |
| 59 | |||
| 60 | static const struct file_operations jfs_loglevel_proc_fops = { | ||
| 61 | .owner = THIS_MODULE, | ||
| 62 | .open = jfs_loglevel_proc_open, | ||
| 63 | .read = seq_read, | ||
| 64 | .llseek = seq_lseek, | ||
| 65 | .release = single_release, | ||
| 66 | .write = jfs_loglevel_proc_write, | ||
| 67 | }; | ||
| 68 | #endif | 68 | #endif |
| 69 | 69 | ||
| 70 | static struct { | 70 | static struct { |
| 71 | const char *name; | 71 | const char *name; |
| 72 | read_proc_t *read_fn; | 72 | const struct file_operations *proc_fops; |
| 73 | write_proc_t *write_fn; | ||
| 74 | } Entries[] = { | 73 | } Entries[] = { |
| 75 | #ifdef CONFIG_JFS_STATISTICS | 74 | #ifdef CONFIG_JFS_STATISTICS |
| 76 | { "lmstats", jfs_lmstats_read, }, | 75 | { "lmstats", &jfs_lmstats_proc_fops, }, |
| 77 | { "txstats", jfs_txstats_read, }, | 76 | { "txstats", &jfs_txstats_proc_fops, }, |
| 78 | { "xtstat", jfs_xtstat_read, }, | 77 | { "xtstat", &jfs_xtstat_proc_fops, }, |
| 79 | { "mpstat", jfs_mpstat_read, }, | 78 | { "mpstat", &jfs_mpstat_proc_fops, }, |
| 80 | #endif | 79 | #endif |
| 81 | #ifdef CONFIG_JFS_DEBUG | 80 | #ifdef CONFIG_JFS_DEBUG |
| 82 | { "TxAnchor", jfs_txanchor_read, }, | 81 | { "TxAnchor", &jfs_txanchor_proc_fops, }, |
| 83 | { "loglevel", loglevel_read, loglevel_write } | 82 | { "loglevel", &jfs_loglevel_proc_fops } |
| 84 | #endif | 83 | #endif |
| 85 | }; | 84 | }; |
| 86 | #define NPROCENT ARRAY_SIZE(Entries) | 85 | #define NPROCENT ARRAY_SIZE(Entries) |
| @@ -93,13 +92,8 @@ void jfs_proc_init(void) | |||
| 93 | return; | 92 | return; |
| 94 | base->owner = THIS_MODULE; | 93 | base->owner = THIS_MODULE; |
| 95 | 94 | ||
| 96 | for (i = 0; i < NPROCENT; i++) { | 95 | for (i = 0; i < NPROCENT; i++) |
| 97 | struct proc_dir_entry *p; | 96 | proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); |
| 98 | if ((p = create_proc_entry(Entries[i].name, 0, base))) { | ||
| 99 | p->read_proc = Entries[i].read_fn; | ||
| 100 | p->write_proc = Entries[i].write_fn; | ||
| 101 | } | ||
| 102 | } | ||
| 103 | } | 97 | } |
| 104 | 98 | ||
| 105 | void jfs_proc_clean(void) | 99 | void jfs_proc_clean(void) |
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index 044c1e654cc0..eafd1300a00b 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h | |||
| @@ -62,7 +62,7 @@ extern void jfs_proc_clean(void); | |||
| 62 | 62 | ||
| 63 | extern int jfsloglevel; | 63 | extern int jfsloglevel; |
| 64 | 64 | ||
| 65 | extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); | 65 | extern const struct file_operations jfs_txanchor_proc_fops; |
| 66 | 66 | ||
| 67 | /* information message: e.g., configuration, major event */ | 67 | /* information message: e.g., configuration, major event */ |
| 68 | #define jfs_info(fmt, arg...) do { \ | 68 | #define jfs_info(fmt, arg...) do { \ |
| @@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); | |||
| 105 | * ---------- | 105 | * ---------- |
| 106 | */ | 106 | */ |
| 107 | #ifdef CONFIG_JFS_STATISTICS | 107 | #ifdef CONFIG_JFS_STATISTICS |
| 108 | extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *); | 108 | extern const struct file_operations jfs_lmstats_proc_fops; |
| 109 | extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *); | 109 | extern const struct file_operations jfs_txstats_proc_fops; |
| 110 | extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *); | 110 | extern const struct file_operations jfs_mpstat_proc_fops; |
| 111 | extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *); | 111 | extern const struct file_operations jfs_xtstat_proc_fops; |
| 112 | 112 | ||
| 113 | #define INCREMENT(x) ((x)++) | 113 | #define INCREMENT(x) ((x)++) |
| 114 | #define DECREMENT(x) ((x)--) | 114 | #define DECREMENT(x) ((x)--) |
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index cdac2d5bafeb..2545bb317235 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h | |||
| @@ -243,9 +243,6 @@ typedef union { | |||
| 243 | #define JFS_REMOVE 3 | 243 | #define JFS_REMOVE 3 |
| 244 | #define JFS_RENAME 4 | 244 | #define JFS_RENAME 4 |
| 245 | 245 | ||
| 246 | #define DIRENTSIZ(namlen) \ | ||
| 247 | ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 ) | ||
| 248 | |||
| 249 | /* | 246 | /* |
| 250 | * Maximum file offset for directories. | 247 | * Maximum file offset for directories. |
| 251 | */ | 248 | */ |
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 734ec916beaf..d6363d8309d0 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c | |||
| @@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) | |||
| 1520 | jfs_error(ip->i_sb, | 1520 | jfs_error(ip->i_sb, |
| 1521 | "diAlloc: can't find free bit " | 1521 | "diAlloc: can't find free bit " |
| 1522 | "in wmap"); | 1522 | "in wmap"); |
| 1523 | return EIO; | 1523 | return -EIO; |
| 1524 | } | 1524 | } |
| 1525 | 1525 | ||
| 1526 | /* determine the inode number within the | 1526 | /* determine the inode number within the |
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 325a9679b95a..cd2ec2988b59 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c | |||
| @@ -69,6 +69,7 @@ | |||
| 69 | #include <linux/freezer.h> | 69 | #include <linux/freezer.h> |
| 70 | #include <linux/delay.h> | 70 | #include <linux/delay.h> |
| 71 | #include <linux/mutex.h> | 71 | #include <linux/mutex.h> |
| 72 | #include <linux/seq_file.h> | ||
| 72 | #include "jfs_incore.h" | 73 | #include "jfs_incore.h" |
| 73 | #include "jfs_filsys.h" | 74 | #include "jfs_filsys.h" |
| 74 | #include "jfs_metapage.h" | 75 | #include "jfs_metapage.h" |
| @@ -2503,13 +2504,9 @@ exit: | |||
| 2503 | } | 2504 | } |
| 2504 | 2505 | ||
| 2505 | #ifdef CONFIG_JFS_STATISTICS | 2506 | #ifdef CONFIG_JFS_STATISTICS |
| 2506 | int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, | 2507 | static int jfs_lmstats_proc_show(struct seq_file *m, void *v) |
| 2507 | int *eof, void *data) | ||
| 2508 | { | 2508 | { |
| 2509 | int len = 0; | 2509 | seq_printf(m, |
| 2510 | off_t begin; | ||
| 2511 | |||
| 2512 | len += sprintf(buffer, | ||
| 2513 | "JFS Logmgr stats\n" | 2510 | "JFS Logmgr stats\n" |
| 2514 | "================\n" | 2511 | "================\n" |
| 2515 | "commits = %d\n" | 2512 | "commits = %d\n" |
| @@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, | |||
| 2522 | lmStat.pagedone, | 2519 | lmStat.pagedone, |
| 2523 | lmStat.full_page, | 2520 | lmStat.full_page, |
| 2524 | lmStat.partial_page); | 2521 | lmStat.partial_page); |
| 2522 | return 0; | ||
| 2523 | } | ||
| 2525 | 2524 | ||
| 2526 | begin = offset; | 2525 | static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) |
| 2527 | *start = buffer + begin; | 2526 | { |
| 2528 | len -= begin; | 2527 | return single_open(file, jfs_lmstats_proc_show, NULL); |
| 2529 | |||
| 2530 | if (len > length) | ||
| 2531 | len = length; | ||
| 2532 | else | ||
| 2533 | *eof = 1; | ||
| 2534 | |||
| 2535 | if (len < 0) | ||
| 2536 | len = 0; | ||
| 2537 | |||
| 2538 | return len; | ||
| 2539 | } | 2528 | } |
| 2529 | |||
| 2530 | const struct file_operations jfs_lmstats_proc_fops = { | ||
| 2531 | .owner = THIS_MODULE, | ||
| 2532 | .open = jfs_lmstats_proc_open, | ||
| 2533 | .read = seq_read, | ||
| 2534 | .llseek = seq_lseek, | ||
| 2535 | .release = single_release, | ||
| 2536 | }; | ||
| 2540 | #endif /* CONFIG_JFS_STATISTICS */ | 2537 | #endif /* CONFIG_JFS_STATISTICS */ |
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d1e64f2f2fcd..854ff0ec574f 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c | |||
| @@ -19,10 +19,12 @@ | |||
| 19 | 19 | ||
| 20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
| 21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
| 22 | #include <linux/module.h> | ||
| 22 | #include <linux/bio.h> | 23 | #include <linux/bio.h> |
| 23 | #include <linux/init.h> | 24 | #include <linux/init.h> |
| 24 | #include <linux/buffer_head.h> | 25 | #include <linux/buffer_head.h> |
| 25 | #include <linux/mempool.h> | 26 | #include <linux/mempool.h> |
| 27 | #include <linux/seq_file.h> | ||
| 26 | #include "jfs_incore.h" | 28 | #include "jfs_incore.h" |
| 27 | #include "jfs_superblock.h" | 29 | #include "jfs_superblock.h" |
| 28 | #include "jfs_filsys.h" | 30 | #include "jfs_filsys.h" |
| @@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) | |||
| 804 | } | 806 | } |
| 805 | 807 | ||
| 806 | #ifdef CONFIG_JFS_STATISTICS | 808 | #ifdef CONFIG_JFS_STATISTICS |
| 807 | int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, | 809 | static int jfs_mpstat_proc_show(struct seq_file *m, void *v) |
| 808 | int *eof, void *data) | ||
| 809 | { | 810 | { |
| 810 | int len = 0; | 811 | seq_printf(m, |
| 811 | off_t begin; | ||
| 812 | |||
| 813 | len += sprintf(buffer, | ||
| 814 | "JFS Metapage statistics\n" | 812 | "JFS Metapage statistics\n" |
| 815 | "=======================\n" | 813 | "=======================\n" |
| 816 | "page allocations = %d\n" | 814 | "page allocations = %d\n" |
| @@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, | |||
| 819 | mpStat.pagealloc, | 817 | mpStat.pagealloc, |
| 820 | mpStat.pagefree, | 818 | mpStat.pagefree, |
| 821 | mpStat.lockwait); | 819 | mpStat.lockwait); |
| 820 | return 0; | ||
| 821 | } | ||
| 822 | 822 | ||
| 823 | begin = offset; | 823 | static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) |
| 824 | *start = buffer + begin; | 824 | { |
| 825 | len -= begin; | 825 | return single_open(file, jfs_mpstat_proc_show, NULL); |
| 826 | |||
| 827 | if (len > length) | ||
| 828 | len = length; | ||
| 829 | else | ||
| 830 | *eof = 1; | ||
| 831 | |||
| 832 | if (len < 0) | ||
| 833 | len = 0; | ||
| 834 | |||
| 835 | return len; | ||
| 836 | } | 826 | } |
| 827 | |||
| 828 | const struct file_operations jfs_mpstat_proc_fops = { | ||
| 829 | .owner = THIS_MODULE, | ||
| 830 | .open = jfs_mpstat_proc_open, | ||
| 831 | .read = seq_read, | ||
| 832 | .llseek = seq_lseek, | ||
| 833 | .release = single_release, | ||
| 834 | }; | ||
| 837 | #endif | 835 | #endif |
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index e7c60ae6b5b2..f26e4d03ada5 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c | |||
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
| 50 | #include <linux/moduleparam.h> | 50 | #include <linux/moduleparam.h> |
| 51 | #include <linux/kthread.h> | 51 | #include <linux/kthread.h> |
| 52 | #include <linux/seq_file.h> | ||
| 52 | #include "jfs_incore.h" | 53 | #include "jfs_incore.h" |
| 53 | #include "jfs_inode.h" | 54 | #include "jfs_inode.h" |
| 54 | #include "jfs_filsys.h" | 55 | #include "jfs_filsys.h" |
| @@ -3009,11 +3010,8 @@ int jfs_sync(void *arg) | |||
| 3009 | } | 3010 | } |
| 3010 | 3011 | ||
| 3011 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) | 3012 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) |
| 3012 | int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, | 3013 | static int jfs_txanchor_proc_show(struct seq_file *m, void *v) |
| 3013 | int *eof, void *data) | ||
| 3014 | { | 3014 | { |
| 3015 | int len = 0; | ||
| 3016 | off_t begin; | ||
| 3017 | char *freewait; | 3015 | char *freewait; |
| 3018 | char *freelockwait; | 3016 | char *freelockwait; |
| 3019 | char *lowlockwait; | 3017 | char *lowlockwait; |
| @@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, | |||
| 3025 | lowlockwait = | 3023 | lowlockwait = |
| 3026 | waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; | 3024 | waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; |
| 3027 | 3025 | ||
| 3028 | len += sprintf(buffer, | 3026 | seq_printf(m, |
| 3029 | "JFS TxAnchor\n" | 3027 | "JFS TxAnchor\n" |
| 3030 | "============\n" | 3028 | "============\n" |
| 3031 | "freetid = %d\n" | 3029 | "freetid = %d\n" |
| @@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, | |||
| 3044 | TxAnchor.tlocksInUse, | 3042 | TxAnchor.tlocksInUse, |
| 3045 | jfs_tlocks_low, | 3043 | jfs_tlocks_low, |
| 3046 | list_empty(&TxAnchor.unlock_queue) ? "" : "not "); | 3044 | list_empty(&TxAnchor.unlock_queue) ? "" : "not "); |
| 3045 | return 0; | ||
| 3046 | } | ||
| 3047 | 3047 | ||
| 3048 | begin = offset; | 3048 | static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) |
| 3049 | *start = buffer + begin; | 3049 | { |
| 3050 | len -= begin; | 3050 | return single_open(file, jfs_txanchor_proc_show, NULL); |
| 3051 | |||
| 3052 | if (len > length) | ||
| 3053 | len = length; | ||
| 3054 | else | ||
| 3055 | *eof = 1; | ||
| 3056 | |||
| 3057 | if (len < 0) | ||
| 3058 | len = 0; | ||
| 3059 | |||
| 3060 | return len; | ||
| 3061 | } | 3051 | } |
| 3052 | |||
| 3053 | const struct file_operations jfs_txanchor_proc_fops = { | ||
| 3054 | .owner = THIS_MODULE, | ||
| 3055 | .open = jfs_txanchor_proc_open, | ||
| 3056 | .read = seq_read, | ||
| 3057 | .llseek = seq_lseek, | ||
| 3058 | .release = single_release, | ||
| 3059 | }; | ||
| 3062 | #endif | 3060 | #endif |
| 3063 | 3061 | ||
| 3064 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) | 3062 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) |
| 3065 | int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, | 3063 | static int jfs_txstats_proc_show(struct seq_file *m, void *v) |
| 3066 | int *eof, void *data) | ||
| 3067 | { | 3064 | { |
| 3068 | int len = 0; | 3065 | seq_printf(m, |
| 3069 | off_t begin; | ||
| 3070 | |||
| 3071 | len += sprintf(buffer, | ||
| 3072 | "JFS TxStats\n" | 3066 | "JFS TxStats\n" |
| 3073 | "===========\n" | 3067 | "===========\n" |
| 3074 | "calls to txBegin = %d\n" | 3068 | "calls to txBegin = %d\n" |
| @@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, | |||
| 3089 | TxStat.txBeginAnon_lockslow, | 3083 | TxStat.txBeginAnon_lockslow, |
| 3090 | TxStat.txLockAlloc, | 3084 | TxStat.txLockAlloc, |
| 3091 | TxStat.txLockAlloc_freelock); | 3085 | TxStat.txLockAlloc_freelock); |
| 3086 | return 0; | ||
| 3087 | } | ||
| 3092 | 3088 | ||
| 3093 | begin = offset; | 3089 | static int jfs_txstats_proc_open(struct inode *inode, struct file *file) |
| 3094 | *start = buffer + begin; | 3090 | { |
| 3095 | len -= begin; | 3091 | return single_open(file, jfs_txstats_proc_show, NULL); |
| 3096 | |||
| 3097 | if (len > length) | ||
| 3098 | len = length; | ||
| 3099 | else | ||
| 3100 | *eof = 1; | ||
| 3101 | |||
| 3102 | if (len < 0) | ||
| 3103 | len = 0; | ||
| 3104 | |||
| 3105 | return len; | ||
| 3106 | } | 3092 | } |
| 3093 | |||
| 3094 | const struct file_operations jfs_txstats_proc_fops = { | ||
| 3095 | .owner = THIS_MODULE, | ||
| 3096 | .open = jfs_txstats_proc_open, | ||
| 3097 | .read = seq_read, | ||
| 3098 | .llseek = seq_lseek, | ||
| 3099 | .release = single_release, | ||
| 3100 | }; | ||
| 3107 | #endif | 3101 | #endif |
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5a61ebf2cbcc..ae3acafb447b 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c | |||
| @@ -20,7 +20,9 @@ | |||
| 20 | */ | 20 | */ |
| 21 | 21 | ||
| 22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
| 23 | #include <linux/module.h> | ||
| 23 | #include <linux/quotaops.h> | 24 | #include <linux/quotaops.h> |
| 25 | #include <linux/seq_file.h> | ||
| 24 | #include "jfs_incore.h" | 26 | #include "jfs_incore.h" |
| 25 | #include "jfs_filsys.h" | 27 | #include "jfs_filsys.h" |
| 26 | #include "jfs_metapage.h" | 28 | #include "jfs_metapage.h" |
| @@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) | |||
| 4134 | } | 4136 | } |
| 4135 | 4137 | ||
| 4136 | #ifdef CONFIG_JFS_STATISTICS | 4138 | #ifdef CONFIG_JFS_STATISTICS |
| 4137 | int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, | 4139 | static int jfs_xtstat_proc_show(struct seq_file *m, void *v) |
| 4138 | int *eof, void *data) | ||
| 4139 | { | 4140 | { |
| 4140 | int len = 0; | 4141 | seq_printf(m, |
| 4141 | off_t begin; | ||
| 4142 | |||
| 4143 | len += sprintf(buffer, | ||
| 4144 | "JFS Xtree statistics\n" | 4142 | "JFS Xtree statistics\n" |
| 4145 | "====================\n" | 4143 | "====================\n" |
| 4146 | "searches = %d\n" | 4144 | "searches = %d\n" |
| @@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, | |||
| 4149 | xtStat.search, | 4147 | xtStat.search, |
| 4150 | xtStat.fastSearch, | 4148 | xtStat.fastSearch, |
| 4151 | xtStat.split); | 4149 | xtStat.split); |
| 4150 | return 0; | ||
| 4151 | } | ||
| 4152 | 4152 | ||
| 4153 | begin = offset; | 4153 | static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) |
| 4154 | *start = buffer + begin; | 4154 | { |
| 4155 | len -= begin; | 4155 | return single_open(file, jfs_xtstat_proc_show, NULL); |
| 4156 | |||
| 4157 | if (len > length) | ||
| 4158 | len = length; | ||
| 4159 | else | ||
| 4160 | *eof = 1; | ||
| 4161 | |||
| 4162 | if (len < 0) | ||
| 4163 | len = 0; | ||
| 4164 | |||
| 4165 | return len; | ||
| 4166 | } | 4156 | } |
| 4157 | |||
| 4158 | const struct file_operations jfs_xtstat_proc_fops = { | ||
| 4159 | .owner = THIS_MODULE, | ||
| 4160 | .open = jfs_xtstat_proc_open, | ||
| 4161 | .read = seq_read, | ||
| 4162 | .llseek = seq_lseek, | ||
| 4163 | .release = single_release, | ||
| 4164 | }; | ||
| 4167 | #endif | 4165 | #endif |
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 0ba6778edaa2..2aba82386810 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c | |||
| @@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc | |||
| 1455 | free_UCSname(&key); | 1455 | free_UCSname(&key); |
| 1456 | if (rc == -ENOENT) { | 1456 | if (rc == -ENOENT) { |
| 1457 | d_add(dentry, NULL); | 1457 | d_add(dentry, NULL); |
| 1458 | return ERR_PTR(0); | 1458 | return NULL; |
| 1459 | } else if (rc) { | 1459 | } else if (rc) { |
| 1460 | jfs_err("jfs_lookup: dtSearch returned %d", rc); | 1460 | jfs_err("jfs_lookup: dtSearch returned %d", rc); |
| 1461 | return ERR_PTR(rc); | 1461 | return ERR_PTR(rc); |
diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 50ea65451732..0288e6d7936a 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c | |||
| @@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 499 | inode = jfs_iget(sb, ROOT_I); | 499 | inode = jfs_iget(sb, ROOT_I); |
| 500 | if (IS_ERR(inode)) { | 500 | if (IS_ERR(inode)) { |
| 501 | ret = PTR_ERR(inode); | 501 | ret = PTR_ERR(inode); |
| 502 | goto out_no_root; | 502 | goto out_no_rw; |
| 503 | } | 503 | } |
| 504 | sb->s_root = d_alloc_root(inode); | 504 | sb->s_root = d_alloc_root(inode); |
| 505 | if (!sb->s_root) | 505 | if (!sb->s_root) |
| @@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 521 | return 0; | 521 | return 0; |
| 522 | 522 | ||
| 523 | out_no_root: | 523 | out_no_root: |
| 524 | jfs_err("jfs_read_super: get root inode failed"); | 524 | jfs_err("jfs_read_super: get root dentry failed"); |
| 525 | if (inode) | 525 | iput(inode); |
| 526 | iput(inode); | ||
| 527 | 526 | ||
| 528 | out_no_rw: | 527 | out_no_rw: |
| 529 | rc = jfs_umount(sb); | 528 | rc = jfs_umount(sb); |
diff --git a/fs/libfs.c b/fs/libfs.c index 892d41cb3382..baeb71ee1cde 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
| @@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count) | |||
| 512 | mntput(mnt); | 512 | mntput(mnt); |
| 513 | } | 513 | } |
| 514 | 514 | ||
| 515 | /** | ||
| 516 | * simple_read_from_buffer - copy data from the buffer to user space | ||
| 517 | * @to: the user space buffer to read to | ||
| 518 | * @count: the maximum number of bytes to read | ||
| 519 | * @ppos: the current position in the buffer | ||
| 520 | * @from: the buffer to read from | ||
| 521 | * @available: the size of the buffer | ||
| 522 | * | ||
| 523 | * The simple_read_from_buffer() function reads up to @count bytes from the | ||
| 524 | * buffer @from at offset @ppos into the user space address starting at @to. | ||
| 525 | * | ||
| 526 | * On success, the number of bytes read is returned and the offset @ppos is | ||
| 527 | * advanced by this number, or negative value is returned on error. | ||
| 528 | **/ | ||
| 515 | ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, | 529 | ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, |
| 516 | const void *from, size_t available) | 530 | const void *from, size_t available) |
| 517 | { | 531 | { |
| @@ -528,6 +542,20 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, | |||
| 528 | return count; | 542 | return count; |
| 529 | } | 543 | } |
| 530 | 544 | ||
| 545 | /** | ||
| 546 | * memory_read_from_buffer - copy data from the buffer | ||
| 547 | * @to: the kernel space buffer to read to | ||
| 548 | * @count: the maximum number of bytes to read | ||
| 549 | * @ppos: the current position in the buffer | ||
| 550 | * @from: the buffer to read from | ||
| 551 | * @available: the size of the buffer | ||
| 552 | * | ||
| 553 | * The memory_read_from_buffer() function reads up to @count bytes from the | ||
| 554 | * buffer @from at offset @ppos into the kernel space address starting at @to. | ||
| 555 | * | ||
| 556 | * On success, the number of bytes read is returned and the offset @ppos is | ||
| 557 | * advanced by this number, or negative value is returned on error. | ||
| 558 | **/ | ||
| 531 | ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, | 559 | ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, |
| 532 | const void *from, size_t available) | 560 | const void *from, size_t available) |
| 533 | { | 561 | { |
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 5df517b81f3f..1f6dc518505c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c | |||
| @@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call) | |||
| 224 | 224 | ||
| 225 | static void nlmclnt_rpc_release(void *data) | 225 | static void nlmclnt_rpc_release(void *data) |
| 226 | { | 226 | { |
| 227 | lock_kernel(); | ||
| 227 | nlm_release_call(data); | 228 | nlm_release_call(data); |
| 229 | unlock_kernel(); | ||
| 228 | } | 230 | } |
| 229 | 231 | ||
| 230 | static int nlm_wait_on_grace(wait_queue_head_t *queue) | 232 | static int nlm_wait_on_grace(wait_queue_head_t *queue) |
| @@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) | |||
| 430 | * Report the conflicting lock back to the application. | 432 | * Report the conflicting lock back to the application. |
| 431 | */ | 433 | */ |
| 432 | fl->fl_start = req->a_res.lock.fl.fl_start; | 434 | fl->fl_start = req->a_res.lock.fl.fl_start; |
| 433 | fl->fl_end = req->a_res.lock.fl.fl_start; | 435 | fl->fl_end = req->a_res.lock.fl.fl_end; |
| 434 | fl->fl_type = req->a_res.lock.fl.fl_type; | 436 | fl->fl_type = req->a_res.lock.fl.fl_type; |
| 435 | fl->fl_pid = 0; | 437 | fl->fl_pid = 0; |
| 436 | break; | 438 | break; |
| @@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) | |||
| 710 | die: | 712 | die: |
| 711 | return; | 713 | return; |
| 712 | retry_rebind: | 714 | retry_rebind: |
| 715 | lock_kernel(); | ||
| 713 | nlm_rebind_host(req->a_host); | 716 | nlm_rebind_host(req->a_host); |
| 717 | unlock_kernel(); | ||
| 714 | retry_unlock: | 718 | retry_unlock: |
| 715 | rpc_restart_call(task); | 719 | rpc_restart_call(task); |
| 716 | } | 720 | } |
| @@ -788,7 +792,9 @@ retry_cancel: | |||
| 788 | /* Don't ever retry more than 3 times */ | 792 | /* Don't ever retry more than 3 times */ |
| 789 | if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) | 793 | if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) |
| 790 | goto die; | 794 | goto die; |
| 795 | lock_kernel(); | ||
| 791 | nlm_rebind_host(req->a_host); | 796 | nlm_rebind_host(req->a_host); |
| 797 | unlock_kernel(); | ||
| 792 | rpc_restart_call(task); | 798 | rpc_restart_call(task); |
| 793 | rpc_delay(task, 30 * HZ); | 799 | rpc_delay(task, 30 * HZ); |
| 794 | } | 800 | } |
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 385437e3387d..2e27176ff42f 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c | |||
| @@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) | |||
| 248 | 248 | ||
| 249 | static void nlm4svc_callback_release(void *data) | 249 | static void nlm4svc_callback_release(void *data) |
| 250 | { | 250 | { |
| 251 | lock_kernel(); | ||
| 251 | nlm_release_call(data); | 252 | nlm_release_call(data); |
| 253 | unlock_kernel(); | ||
| 252 | } | 254 | } |
| 253 | 255 | ||
| 254 | static const struct rpc_call_ops nlm4svc_callback_ops = { | 256 | static const struct rpc_call_ops nlm4svc_callback_ops = { |
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 81aca859bfde..56a08ab9a4cb 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c | |||
| @@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) | |||
| 795 | 795 | ||
| 796 | dprintk("lockd: GRANT_MSG RPC callback\n"); | 796 | dprintk("lockd: GRANT_MSG RPC callback\n"); |
| 797 | 797 | ||
| 798 | lock_kernel(); | ||
| 798 | /* if the block is not on a list at this point then it has | 799 | /* if the block is not on a list at this point then it has |
| 799 | * been invalidated. Don't try to requeue it. | 800 | * been invalidated. Don't try to requeue it. |
| 800 | * | 801 | * |
| @@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) | |||
| 804 | * for nlm_blocked? | 805 | * for nlm_blocked? |
| 805 | */ | 806 | */ |
| 806 | if (list_empty(&block->b_list)) | 807 | if (list_empty(&block->b_list)) |
| 807 | return; | 808 | goto out; |
| 808 | 809 | ||
| 809 | /* Technically, we should down the file semaphore here. Since we | 810 | /* Technically, we should down the file semaphore here. Since we |
| 810 | * move the block towards the head of the queue only, no harm | 811 | * move the block towards the head of the queue only, no harm |
| @@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) | |||
| 818 | } | 819 | } |
| 819 | nlmsvc_insert_block(block, timeout); | 820 | nlmsvc_insert_block(block, timeout); |
| 820 | svc_wake_up(block->b_daemon); | 821 | svc_wake_up(block->b_daemon); |
| 822 | out: | ||
| 823 | unlock_kernel(); | ||
| 821 | } | 824 | } |
| 822 | 825 | ||
| 823 | static void nlmsvc_grant_release(void *data) | 826 | static void nlmsvc_grant_release(void *data) |
| 824 | { | 827 | { |
| 825 | struct nlm_rqst *call = data; | 828 | struct nlm_rqst *call = data; |
| 826 | 829 | ||
| 830 | lock_kernel(); | ||
| 827 | nlmsvc_release_block(call->a_block); | 831 | nlmsvc_release_block(call->a_block); |
| 832 | unlock_kernel(); | ||
| 828 | } | 833 | } |
| 829 | 834 | ||
| 830 | static const struct rpc_call_ops nlmsvc_grant_ops = { | 835 | static const struct rpc_call_ops nlmsvc_grant_ops = { |
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 88379cc6e0b1..ce6952b50a75 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c | |||
| @@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) | |||
| 278 | 278 | ||
| 279 | static void nlmsvc_callback_release(void *data) | 279 | static void nlmsvc_callback_release(void *data) |
| 280 | { | 280 | { |
| 281 | lock_kernel(); | ||
| 281 | nlm_release_call(data); | 282 | nlm_release_call(data); |
| 283 | unlock_kernel(); | ||
| 282 | } | 284 | } |
| 283 | 285 | ||
| 284 | static const struct rpc_call_ops nlmsvc_callback_ops = { | 286 | static const struct rpc_call_ops nlmsvc_callback_ops = { |
diff --git a/fs/locks.c b/fs/locks.c index 11dbf08651b7..dce8c747371c 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
| @@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) | |||
| 561 | /* insert into file's list */ | 561 | /* insert into file's list */ |
| 562 | fl->fl_next = *pos; | 562 | fl->fl_next = *pos; |
| 563 | *pos = fl; | 563 | *pos = fl; |
| 564 | |||
| 565 | if (fl->fl_ops && fl->fl_ops->fl_insert) | ||
| 566 | fl->fl_ops->fl_insert(fl); | ||
| 567 | } | 564 | } |
| 568 | 565 | ||
| 569 | /* | 566 | /* |
| @@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) | |||
| 586 | fl->fl_fasync = NULL; | 583 | fl->fl_fasync = NULL; |
| 587 | } | 584 | } |
| 588 | 585 | ||
| 589 | if (fl->fl_ops && fl->fl_ops->fl_remove) | ||
| 590 | fl->fl_ops->fl_remove(fl); | ||
| 591 | |||
| 592 | if (fl->fl_nspid) { | 586 | if (fl->fl_nspid) { |
| 593 | put_pid(fl->fl_nspid); | 587 | put_pid(fl->fl_nspid); |
| 594 | fl->fl_nspid = NULL; | 588 | fl->fl_nspid = NULL; |
diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a8..dbcc7af76a15 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
| @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) | |||
| 82 | bio_put(bio); | 82 | bio_put(bio); |
| 83 | } | 83 | } |
| 84 | 84 | ||
| 85 | static struct bio *mpage_bio_submit(int rw, struct bio *bio) | 85 | struct bio *mpage_bio_submit(int rw, struct bio *bio) |
| 86 | { | 86 | { |
| 87 | bio->bi_end_io = mpage_end_io_read; | 87 | bio->bi_end_io = mpage_end_io_read; |
| 88 | if (rw == WRITE) | 88 | if (rw == WRITE) |
| @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) | |||
| 90 | submit_bio(rw, bio); | 90 | submit_bio(rw, bio); |
| 91 | return NULL; | 91 | return NULL; |
| 92 | } | 92 | } |
| 93 | EXPORT_SYMBOL(mpage_bio_submit); | ||
| 93 | 94 | ||
| 94 | static struct bio * | 95 | static struct bio * |
| 95 | mpage_alloc(struct block_device *bdev, | 96 | mpage_alloc(struct block_device *bdev, |
| @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); | |||
| 435 | * written, so it can intelligently allocate a suitably-sized BIO. For now, | 436 | * written, so it can intelligently allocate a suitably-sized BIO. For now, |
| 436 | * just allocate full-size (16-page) BIOs. | 437 | * just allocate full-size (16-page) BIOs. |
| 437 | */ | 438 | */ |
| 438 | struct mpage_data { | ||
| 439 | struct bio *bio; | ||
| 440 | sector_t last_block_in_bio; | ||
| 441 | get_block_t *get_block; | ||
| 442 | unsigned use_writepage; | ||
| 443 | }; | ||
| 444 | 439 | ||
| 445 | static int __mpage_writepage(struct page *page, struct writeback_control *wbc, | 440 | int __mpage_writepage(struct page *page, struct writeback_control *wbc, |
| 446 | void *data) | 441 | void *data) |
| 447 | { | 442 | { |
| 448 | struct mpage_data *mpd = data; | 443 | struct mpage_data *mpd = data; |
| 449 | struct bio *bio = mpd->bio; | 444 | struct bio *bio = mpd->bio; |
| @@ -651,6 +646,7 @@ out: | |||
| 651 | mpd->bio = bio; | 646 | mpd->bio = bio; |
| 652 | return ret; | 647 | return ret; |
| 653 | } | 648 | } |
| 649 | EXPORT_SYMBOL(__mpage_writepage); | ||
| 654 | 650 | ||
| 655 | /** | 651 | /** |
| 656 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them | 652 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them |
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 05ff4f1d7026..1f7f2956412a 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c | |||
| @@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, | |||
| 214 | 214 | ||
| 215 | dentry->d_op = &msdos_dentry_operations; | 215 | dentry->d_op = &msdos_dentry_operations; |
| 216 | 216 | ||
| 217 | lock_kernel(); | 217 | lock_super(sb); |
| 218 | res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); | 218 | res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); |
| 219 | if (res == -ENOENT) | 219 | if (res == -ENOENT) |
| 220 | goto add; | 220 | goto add; |
| @@ -232,7 +232,7 @@ add: | |||
| 232 | if (dentry) | 232 | if (dentry) |
| 233 | dentry->d_op = &msdos_dentry_operations; | 233 | dentry->d_op = &msdos_dentry_operations; |
| 234 | out: | 234 | out: |
| 235 | unlock_kernel(); | 235 | unlock_super(sb); |
| 236 | if (!res) | 236 | if (!res) |
| 237 | return dentry; | 237 | return dentry; |
| 238 | return ERR_PTR(res); | 238 | return ERR_PTR(res); |
| @@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, | |||
| 286 | unsigned char msdos_name[MSDOS_NAME]; | 286 | unsigned char msdos_name[MSDOS_NAME]; |
| 287 | int err, is_hid; | 287 | int err, is_hid; |
| 288 | 288 | ||
| 289 | lock_kernel(); | 289 | lock_super(sb); |
| 290 | 290 | ||
| 291 | err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, | 291 | err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, |
| 292 | msdos_name, &MSDOS_SB(sb)->options); | 292 | msdos_name, &MSDOS_SB(sb)->options); |
| @@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, | |||
| 315 | 315 | ||
| 316 | d_instantiate(dentry, inode); | 316 | d_instantiate(dentry, inode); |
| 317 | out: | 317 | out: |
| 318 | unlock_kernel(); | 318 | unlock_super(sb); |
| 319 | if (!err) | 319 | if (!err) |
| 320 | err = fat_flush_inodes(sb, dir, inode); | 320 | err = fat_flush_inodes(sb, dir, inode); |
| 321 | return err; | 321 | return err; |
| @@ -324,11 +324,12 @@ out: | |||
| 324 | /***** Remove a directory */ | 324 | /***** Remove a directory */ |
| 325 | static int msdos_rmdir(struct inode *dir, struct dentry *dentry) | 325 | static int msdos_rmdir(struct inode *dir, struct dentry *dentry) |
| 326 | { | 326 | { |
| 327 | struct super_block *sb = dir->i_sb; | ||
| 327 | struct inode *inode = dentry->d_inode; | 328 | struct inode *inode = dentry->d_inode; |
| 328 | struct fat_slot_info sinfo; | 329 | struct fat_slot_info sinfo; |
| 329 | int err; | 330 | int err; |
| 330 | 331 | ||
| 331 | lock_kernel(); | 332 | lock_super(sb); |
| 332 | /* | 333 | /* |
| 333 | * Check whether the directory is not in use, then check | 334 | * Check whether the directory is not in use, then check |
| 334 | * whether it is empty. | 335 | * whether it is empty. |
| @@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 349 | inode->i_ctime = CURRENT_TIME_SEC; | 350 | inode->i_ctime = CURRENT_TIME_SEC; |
| 350 | fat_detach(inode); | 351 | fat_detach(inode); |
| 351 | out: | 352 | out: |
| 352 | unlock_kernel(); | 353 | unlock_super(sb); |
| 353 | if (!err) | 354 | if (!err) |
| 354 | err = fat_flush_inodes(inode->i_sb, dir, inode); | 355 | err = fat_flush_inodes(sb, dir, inode); |
| 355 | 356 | ||
| 356 | return err; | 357 | return err; |
| 357 | } | 358 | } |
| @@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 366 | struct timespec ts; | 367 | struct timespec ts; |
| 367 | int err, is_hid, cluster; | 368 | int err, is_hid, cluster; |
| 368 | 369 | ||
| 369 | lock_kernel(); | 370 | lock_super(sb); |
| 370 | 371 | ||
| 371 | err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, | 372 | err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, |
| 372 | msdos_name, &MSDOS_SB(sb)->options); | 373 | msdos_name, &MSDOS_SB(sb)->options); |
| @@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 404 | 405 | ||
| 405 | d_instantiate(dentry, inode); | 406 | d_instantiate(dentry, inode); |
| 406 | 407 | ||
| 407 | unlock_kernel(); | 408 | unlock_super(sb); |
| 408 | fat_flush_inodes(sb, dir, inode); | 409 | fat_flush_inodes(sb, dir, inode); |
| 409 | return 0; | 410 | return 0; |
| 410 | 411 | ||
| 411 | out_free: | 412 | out_free: |
| 412 | fat_free_clusters(dir, cluster); | 413 | fat_free_clusters(dir, cluster); |
| 413 | out: | 414 | out: |
| 414 | unlock_kernel(); | 415 | unlock_super(sb); |
| 415 | return err; | 416 | return err; |
| 416 | } | 417 | } |
| 417 | 418 | ||
| @@ -419,10 +420,11 @@ out: | |||
| 419 | static int msdos_unlink(struct inode *dir, struct dentry *dentry) | 420 | static int msdos_unlink(struct inode *dir, struct dentry *dentry) |
| 420 | { | 421 | { |
| 421 | struct inode *inode = dentry->d_inode; | 422 | struct inode *inode = dentry->d_inode; |
| 423 | struct super_block *sb= inode->i_sb; | ||
| 422 | struct fat_slot_info sinfo; | 424 | struct fat_slot_info sinfo; |
| 423 | int err; | 425 | int err; |
| 424 | 426 | ||
| 425 | lock_kernel(); | 427 | lock_super(sb); |
| 426 | err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); | 428 | err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); |
| 427 | if (err) | 429 | if (err) |
| 428 | goto out; | 430 | goto out; |
| @@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) | |||
| 434 | inode->i_ctime = CURRENT_TIME_SEC; | 436 | inode->i_ctime = CURRENT_TIME_SEC; |
| 435 | fat_detach(inode); | 437 | fat_detach(inode); |
| 436 | out: | 438 | out: |
| 437 | unlock_kernel(); | 439 | unlock_super(sb); |
| 438 | if (!err) | 440 | if (!err) |
| 439 | err = fat_flush_inodes(inode->i_sb, dir, inode); | 441 | err = fat_flush_inodes(sb, dir, inode); |
| 440 | 442 | ||
| 441 | return err; | 443 | return err; |
| 442 | } | 444 | } |
| @@ -618,10 +620,11 @@ error_inode: | |||
| 618 | static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, | 620 | static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, |
| 619 | struct inode *new_dir, struct dentry *new_dentry) | 621 | struct inode *new_dir, struct dentry *new_dentry) |
| 620 | { | 622 | { |
| 623 | struct super_block *sb = old_dir->i_sb; | ||
| 621 | unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; | 624 | unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; |
| 622 | int err, is_hid; | 625 | int err, is_hid; |
| 623 | 626 | ||
| 624 | lock_kernel(); | 627 | lock_super(sb); |
| 625 | 628 | ||
| 626 | err = msdos_format_name(old_dentry->d_name.name, | 629 | err = msdos_format_name(old_dentry->d_name.name, |
| 627 | old_dentry->d_name.len, old_msdos_name, | 630 | old_dentry->d_name.len, old_msdos_name, |
| @@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 640 | err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, | 643 | err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, |
| 641 | new_dir, new_msdos_name, new_dentry, is_hid); | 644 | new_dir, new_msdos_name, new_dentry, is_hid); |
| 642 | out: | 645 | out: |
| 643 | unlock_kernel(); | 646 | unlock_super(sb); |
| 644 | if (!err) | 647 | if (!err) |
| 645 | err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); | 648 | err = fat_flush_inodes(sb, old_dir, new_dir); |
| 646 | return err; | 649 | return err; |
| 647 | } | 650 | } |
| 648 | 651 | ||
diff --git a/fs/namei.c b/fs/namei.c index c7e43536c49a..01e67dddcc3d 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -581,15 +581,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd | |||
| 581 | int result; | 581 | int result; |
| 582 | 582 | ||
| 583 | /* make sure the stuff we saved doesn't go away */ | 583 | /* make sure the stuff we saved doesn't go away */ |
| 584 | dget(save.dentry); | 584 | path_get(&save); |
| 585 | mntget(save.mnt); | ||
| 586 | 585 | ||
| 587 | result = __link_path_walk(name, nd); | 586 | result = __link_path_walk(name, nd); |
| 588 | if (result == -ESTALE) { | 587 | if (result == -ESTALE) { |
| 589 | /* nd->path had been dropped */ | 588 | /* nd->path had been dropped */ |
| 590 | nd->path = save; | 589 | nd->path = save; |
| 591 | dget(nd->path.dentry); | 590 | path_get(&nd->path); |
| 592 | mntget(nd->path.mnt); | ||
| 593 | nd->flags |= LOOKUP_REVAL; | 591 | nd->flags |= LOOKUP_REVAL; |
| 594 | result = __link_path_walk(name, nd); | 592 | result = __link_path_walk(name, nd); |
| 595 | } | 593 | } |
| @@ -1216,8 +1214,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, | |||
| 1216 | nd->flags = flags; | 1214 | nd->flags = flags; |
| 1217 | nd->depth = 0; | 1215 | nd->depth = 0; |
| 1218 | 1216 | ||
| 1219 | nd->path.mnt = mntget(mnt); | 1217 | nd->path.dentry = dentry; |
| 1220 | nd->path.dentry = dget(dentry); | 1218 | nd->path.mnt = mnt; |
| 1219 | path_get(&nd->path); | ||
| 1221 | 1220 | ||
| 1222 | retval = path_walk(name, nd); | 1221 | retval = path_walk(name, nd); |
| 1223 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1222 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && |
| @@ -2857,16 +2856,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) | |||
| 2857 | { | 2856 | { |
| 2858 | struct nameidata nd; | 2857 | struct nameidata nd; |
| 2859 | void *cookie; | 2858 | void *cookie; |
| 2859 | int res; | ||
| 2860 | 2860 | ||
| 2861 | nd.depth = 0; | 2861 | nd.depth = 0; |
| 2862 | cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); | 2862 | cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); |
| 2863 | if (!IS_ERR(cookie)) { | 2863 | if (IS_ERR(cookie)) |
| 2864 | int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); | 2864 | return PTR_ERR(cookie); |
| 2865 | if (dentry->d_inode->i_op->put_link) | 2865 | |
| 2866 | dentry->d_inode->i_op->put_link(dentry, &nd, cookie); | 2866 | res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); |
| 2867 | cookie = ERR_PTR(res); | 2867 | if (dentry->d_inode->i_op->put_link) |
| 2868 | } | 2868 | dentry->d_inode->i_op->put_link(dentry, &nd, cookie); |
| 2869 | return PTR_ERR(cookie); | 2869 | return res; |
| 2870 | } | 2870 | } |
| 2871 | 2871 | ||
| 2872 | int vfs_follow_link(struct nameidata *nd, const char *link) | 2872 | int vfs_follow_link(struct nameidata *nd, const char *link) |
diff --git a/fs/namespace.c b/fs/namespace.c index 4fc302c2a0e0..4f6f7635b59c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
| @@ -750,7 +750,7 @@ struct proc_fs_info { | |||
| 750 | const char *str; | 750 | const char *str; |
| 751 | }; | 751 | }; |
| 752 | 752 | ||
| 753 | static void show_sb_opts(struct seq_file *m, struct super_block *sb) | 753 | static int show_sb_opts(struct seq_file *m, struct super_block *sb) |
| 754 | { | 754 | { |
| 755 | static const struct proc_fs_info fs_info[] = { | 755 | static const struct proc_fs_info fs_info[] = { |
| 756 | { MS_SYNCHRONOUS, ",sync" }, | 756 | { MS_SYNCHRONOUS, ",sync" }, |
| @@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb) | |||
| 764 | if (sb->s_flags & fs_infop->flag) | 764 | if (sb->s_flags & fs_infop->flag) |
| 765 | seq_puts(m, fs_infop->str); | 765 | seq_puts(m, fs_infop->str); |
| 766 | } | 766 | } |
| 767 | |||
| 768 | return security_sb_show_options(m, sb); | ||
| 767 | } | 769 | } |
| 768 | 770 | ||
| 769 | static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) | 771 | static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) |
| @@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) | |||
| 806 | seq_putc(m, ' '); | 808 | seq_putc(m, ' '); |
| 807 | show_type(m, mnt->mnt_sb); | 809 | show_type(m, mnt->mnt_sb); |
| 808 | seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); | 810 | seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); |
| 809 | show_sb_opts(m, mnt->mnt_sb); | 811 | err = show_sb_opts(m, mnt->mnt_sb); |
| 812 | if (err) | ||
| 813 | goto out; | ||
| 810 | show_mnt_opts(m, mnt); | 814 | show_mnt_opts(m, mnt); |
| 811 | if (mnt->mnt_sb->s_op->show_options) | 815 | if (mnt->mnt_sb->s_op->show_options) |
| 812 | err = mnt->mnt_sb->s_op->show_options(m, mnt); | 816 | err = mnt->mnt_sb->s_op->show_options(m, mnt); |
| 813 | seq_puts(m, " 0 0\n"); | 817 | seq_puts(m, " 0 0\n"); |
| 818 | out: | ||
| 814 | return err; | 819 | return err; |
| 815 | } | 820 | } |
| 816 | 821 | ||
| @@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v) | |||
| 865 | seq_putc(m, ' '); | 870 | seq_putc(m, ' '); |
| 866 | mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); | 871 | mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); |
| 867 | seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); | 872 | seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); |
| 868 | show_sb_opts(m, sb); | 873 | err = show_sb_opts(m, sb); |
| 874 | if (err) | ||
| 875 | goto out; | ||
| 869 | if (sb->s_op->show_options) | 876 | if (sb->s_op->show_options) |
| 870 | err = sb->s_op->show_options(m, mnt); | 877 | err = sb->s_op->show_options(m, mnt); |
| 871 | seq_putc(m, '\n'); | 878 | seq_putc(m, '\n'); |
| 879 | out: | ||
| 872 | return err; | 880 | return err; |
| 873 | } | 881 | } |
| 874 | 882 | ||
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 2b145de45b39..6a7d901f1936 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
| 19 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/smp_lock.h> | ||
| 21 | 22 | ||
| 22 | #include <linux/ncp_fs.h> | 23 | #include <linux/ncp_fs.h> |
| 23 | #include "ncplib_kernel.h" | 24 | #include "ncplib_kernel.h" |
| @@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) { | |||
| 281 | return 0; | 282 | return 0; |
| 282 | } | 283 | } |
| 283 | 284 | ||
| 285 | static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin) | ||
| 286 | { | ||
| 287 | loff_t ret; | ||
| 288 | lock_kernel(); | ||
| 289 | ret = generic_file_llseek_unlocked(file, offset, origin); | ||
| 290 | unlock_kernel(); | ||
| 291 | return ret; | ||
| 292 | } | ||
| 293 | |||
| 284 | const struct file_operations ncp_file_operations = | 294 | const struct file_operations ncp_file_operations = |
| 285 | { | 295 | { |
| 286 | .llseek = remote_llseek, | 296 | .llseek = ncp_remote_llseek, |
| 287 | .read = ncp_file_read, | 297 | .read = ncp_file_read, |
| 288 | .write = ncp_file_write, | 298 | .write = ncp_file_write, |
| 289 | .ioctl = ncp_ioctl, | 299 | .ioctl = ncp_ioctl, |
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c1e7c8300629..f447f4b4476c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | 27 | ||
| 28 | struct nfs_callback_data { | 28 | struct nfs_callback_data { |
| 29 | unsigned int users; | 29 | unsigned int users; |
| 30 | struct svc_serv *serv; | 30 | struct svc_rqst *rqst; |
| 31 | struct task_struct *task; | 31 | struct task_struct *task; |
| 32 | }; | 32 | }; |
| 33 | 33 | ||
| @@ -91,21 +91,17 @@ nfs_callback_svc(void *vrqstp) | |||
| 91 | svc_process(rqstp); | 91 | svc_process(rqstp); |
| 92 | } | 92 | } |
| 93 | unlock_kernel(); | 93 | unlock_kernel(); |
| 94 | nfs_callback_info.task = NULL; | ||
| 95 | svc_exit_thread(rqstp); | ||
| 96 | return 0; | 94 | return 0; |
| 97 | } | 95 | } |
| 98 | 96 | ||
| 99 | /* | 97 | /* |
| 100 | * Bring up the server process if it is not already up. | 98 | * Bring up the callback thread if it is not already up. |
| 101 | */ | 99 | */ |
| 102 | int nfs_callback_up(void) | 100 | int nfs_callback_up(void) |
| 103 | { | 101 | { |
| 104 | struct svc_serv *serv = NULL; | 102 | struct svc_serv *serv = NULL; |
| 105 | struct svc_rqst *rqstp; | ||
| 106 | int ret = 0; | 103 | int ret = 0; |
| 107 | 104 | ||
| 108 | lock_kernel(); | ||
| 109 | mutex_lock(&nfs_callback_mutex); | 105 | mutex_lock(&nfs_callback_mutex); |
| 110 | if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) | 106 | if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) |
| 111 | goto out; | 107 | goto out; |
| @@ -121,22 +117,23 @@ int nfs_callback_up(void) | |||
| 121 | nfs_callback_tcpport = ret; | 117 | nfs_callback_tcpport = ret; |
| 122 | dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); | 118 | dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); |
| 123 | 119 | ||
| 124 | rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); | 120 | nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); |
| 125 | if (IS_ERR(rqstp)) { | 121 | if (IS_ERR(nfs_callback_info.rqst)) { |
| 126 | ret = PTR_ERR(rqstp); | 122 | ret = PTR_ERR(nfs_callback_info.rqst); |
| 123 | nfs_callback_info.rqst = NULL; | ||
| 127 | goto out_err; | 124 | goto out_err; |
| 128 | } | 125 | } |
| 129 | 126 | ||
| 130 | svc_sock_update_bufs(serv); | 127 | svc_sock_update_bufs(serv); |
| 131 | nfs_callback_info.serv = serv; | ||
| 132 | 128 | ||
| 133 | nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, | 129 | nfs_callback_info.task = kthread_run(nfs_callback_svc, |
| 130 | nfs_callback_info.rqst, | ||
| 134 | "nfsv4-svc"); | 131 | "nfsv4-svc"); |
| 135 | if (IS_ERR(nfs_callback_info.task)) { | 132 | if (IS_ERR(nfs_callback_info.task)) { |
| 136 | ret = PTR_ERR(nfs_callback_info.task); | 133 | ret = PTR_ERR(nfs_callback_info.task); |
| 137 | nfs_callback_info.serv = NULL; | 134 | svc_exit_thread(nfs_callback_info.rqst); |
| 135 | nfs_callback_info.rqst = NULL; | ||
| 138 | nfs_callback_info.task = NULL; | 136 | nfs_callback_info.task = NULL; |
| 139 | svc_exit_thread(rqstp); | ||
| 140 | goto out_err; | 137 | goto out_err; |
| 141 | } | 138 | } |
| 142 | out: | 139 | out: |
| @@ -149,7 +146,6 @@ out: | |||
| 149 | if (serv) | 146 | if (serv) |
| 150 | svc_destroy(serv); | 147 | svc_destroy(serv); |
| 151 | mutex_unlock(&nfs_callback_mutex); | 148 | mutex_unlock(&nfs_callback_mutex); |
| 152 | unlock_kernel(); | ||
| 153 | return ret; | 149 | return ret; |
| 154 | out_err: | 150 | out_err: |
| 155 | dprintk("Couldn't create callback socket or server thread; err = %d\n", | 151 | dprintk("Couldn't create callback socket or server thread; err = %d\n", |
| @@ -159,17 +155,19 @@ out_err: | |||
| 159 | } | 155 | } |
| 160 | 156 | ||
| 161 | /* | 157 | /* |
| 162 | * Kill the server process if it is not already down. | 158 | * Kill the callback thread if it's no longer being used. |
| 163 | */ | 159 | */ |
| 164 | void nfs_callback_down(void) | 160 | void nfs_callback_down(void) |
| 165 | { | 161 | { |
| 166 | lock_kernel(); | ||
| 167 | mutex_lock(&nfs_callback_mutex); | 162 | mutex_lock(&nfs_callback_mutex); |
| 168 | nfs_callback_info.users--; | 163 | nfs_callback_info.users--; |
| 169 | if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) | 164 | if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) { |
| 170 | kthread_stop(nfs_callback_info.task); | 165 | kthread_stop(nfs_callback_info.task); |
| 166 | svc_exit_thread(nfs_callback_info.rqst); | ||
| 167 | nfs_callback_info.rqst = NULL; | ||
| 168 | nfs_callback_info.task = NULL; | ||
| 169 | } | ||
| 171 | mutex_unlock(&nfs_callback_mutex); | 170 | mutex_unlock(&nfs_callback_mutex); |
| 172 | unlock_kernel(); | ||
| 173 | } | 171 | } |
| 174 | 172 | ||
| 175 | static int nfs_callback_authenticate(struct svc_rqst *rqstp) | 173 | static int nfs_callback_authenticate(struct svc_rqst *rqstp) |
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f2a092ca69b5..5ee23e7058b3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c | |||
| @@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, | |||
| 431 | { | 431 | { |
| 432 | to->to_initval = timeo * HZ / 10; | 432 | to->to_initval = timeo * HZ / 10; |
| 433 | to->to_retries = retrans; | 433 | to->to_retries = retrans; |
| 434 | if (!to->to_retries) | ||
| 435 | to->to_retries = 2; | ||
| 436 | 434 | ||
| 437 | switch (proto) { | 435 | switch (proto) { |
| 438 | case XPRT_TRANSPORT_TCP: | 436 | case XPRT_TRANSPORT_TCP: |
| 439 | case XPRT_TRANSPORT_RDMA: | 437 | case XPRT_TRANSPORT_RDMA: |
| 438 | if (to->to_retries == 0) | ||
| 439 | to->to_retries = NFS_DEF_TCP_RETRANS; | ||
| 440 | if (to->to_initval == 0) | 440 | if (to->to_initval == 0) |
| 441 | to->to_initval = 60 * HZ; | 441 | to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; |
| 442 | if (to->to_initval > NFS_MAX_TCP_TIMEOUT) | 442 | if (to->to_initval > NFS_MAX_TCP_TIMEOUT) |
| 443 | to->to_initval = NFS_MAX_TCP_TIMEOUT; | 443 | to->to_initval = NFS_MAX_TCP_TIMEOUT; |
| 444 | to->to_increment = to->to_initval; | 444 | to->to_increment = to->to_initval; |
| @@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, | |||
| 450 | to->to_exponential = 0; | 450 | to->to_exponential = 0; |
| 451 | break; | 451 | break; |
| 452 | case XPRT_TRANSPORT_UDP: | 452 | case XPRT_TRANSPORT_UDP: |
| 453 | default: | 453 | if (to->to_retries == 0) |
| 454 | to->to_retries = NFS_DEF_UDP_RETRANS; | ||
| 454 | if (!to->to_initval) | 455 | if (!to->to_initval) |
| 455 | to->to_initval = 11 * HZ / 10; | 456 | to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; |
| 456 | if (to->to_initval > NFS_MAX_UDP_TIMEOUT) | 457 | if (to->to_initval > NFS_MAX_UDP_TIMEOUT) |
| 457 | to->to_initval = NFS_MAX_UDP_TIMEOUT; | 458 | to->to_initval = NFS_MAX_UDP_TIMEOUT; |
| 458 | to->to_maxval = NFS_MAX_UDP_TIMEOUT; | 459 | to->to_maxval = NFS_MAX_UDP_TIMEOUT; |
| 459 | to->to_exponential = 1; | 460 | to->to_exponential = 1; |
| 460 | break; | 461 | break; |
| 462 | default: | ||
| 463 | BUG(); | ||
| 461 | } | 464 | } |
| 462 | } | 465 | } |
| 463 | 466 | ||
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58d43daec084..28a238dab23a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
| @@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp) | |||
| 133 | { | 133 | { |
| 134 | int res; | 134 | int res; |
| 135 | 135 | ||
| 136 | dfprintk(VFS, "NFS: opendir(%s/%ld)\n", | 136 | dfprintk(FILE, "NFS: open dir(%s/%s)\n", |
| 137 | inode->i_sb->s_id, inode->i_ino); | 137 | filp->f_path.dentry->d_parent->d_name.name, |
| 138 | filp->f_path.dentry->d_name.name); | ||
| 139 | |||
| 140 | nfs_inc_stats(inode, NFSIOS_VFSOPEN); | ||
| 138 | 141 | ||
| 139 | lock_kernel(); | ||
| 140 | /* Call generic open code in order to cache credentials */ | 142 | /* Call generic open code in order to cache credentials */ |
| 141 | res = nfs_open(inode, filp); | 143 | res = nfs_open(inode, filp); |
| 142 | unlock_kernel(); | ||
| 143 | return res; | 144 | return res; |
| 144 | } | 145 | } |
| 145 | 146 | ||
| @@ -204,7 +205,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) | |||
| 204 | * Note: assumes we have exclusive access to this mapping either | 205 | * Note: assumes we have exclusive access to this mapping either |
| 205 | * through inode->i_mutex or some other mechanism. | 206 | * through inode->i_mutex or some other mechanism. |
| 206 | */ | 207 | */ |
| 207 | if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) { | 208 | if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { |
| 208 | /* Should never happen */ | 209 | /* Should never happen */ |
| 209 | nfs_zap_mapping(inode, inode->i_mapping); | 210 | nfs_zap_mapping(inode, inode->i_mapping); |
| 210 | } | 211 | } |
| @@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 528 | struct nfs_fattr fattr; | 529 | struct nfs_fattr fattr; |
| 529 | long res; | 530 | long res; |
| 530 | 531 | ||
| 531 | dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", | 532 | dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", |
| 532 | dentry->d_parent->d_name.name, dentry->d_name.name, | 533 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 533 | (long long)filp->f_pos); | 534 | (long long)filp->f_pos); |
| 534 | nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); | 535 | nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); |
| 535 | 536 | ||
| 536 | lock_kernel(); | ||
| 537 | |||
| 538 | /* | 537 | /* |
| 539 | * filp->f_pos points to the dirent entry number. | 538 | * filp->f_pos points to the dirent entry number. |
| 540 | * *desc->dir_cookie has the cookie for the next entry. We have | 539 | * *desc->dir_cookie has the cookie for the next entry. We have |
| @@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 592 | } | 591 | } |
| 593 | out: | 592 | out: |
| 594 | nfs_unblock_sillyrename(dentry); | 593 | nfs_unblock_sillyrename(dentry); |
| 595 | unlock_kernel(); | ||
| 596 | if (res > 0) | 594 | if (res > 0) |
| 597 | res = 0; | 595 | res = 0; |
| 598 | dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", | 596 | dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", |
| 599 | dentry->d_parent->d_name.name, dentry->d_name.name, | 597 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 600 | res); | 598 | res); |
| 601 | return res; | 599 | return res; |
| @@ -603,7 +601,15 @@ out: | |||
| 603 | 601 | ||
| 604 | static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) | 602 | static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) |
| 605 | { | 603 | { |
| 606 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); | 604 | struct dentry *dentry = filp->f_path.dentry; |
| 605 | struct inode *inode = dentry->d_inode; | ||
| 606 | |||
| 607 | dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", | ||
| 608 | dentry->d_parent->d_name.name, | ||
| 609 | dentry->d_name.name, | ||
| 610 | offset, origin); | ||
| 611 | |||
| 612 | mutex_lock(&inode->i_mutex); | ||
| 607 | switch (origin) { | 613 | switch (origin) { |
| 608 | case 1: | 614 | case 1: |
| 609 | offset += filp->f_pos; | 615 | offset += filp->f_pos; |
| @@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) | |||
| 619 | nfs_file_open_context(filp)->dir_cookie = 0; | 625 | nfs_file_open_context(filp)->dir_cookie = 0; |
| 620 | } | 626 | } |
| 621 | out: | 627 | out: |
| 622 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); | 628 | mutex_unlock(&inode->i_mutex); |
| 623 | return offset; | 629 | return offset; |
| 624 | } | 630 | } |
| 625 | 631 | ||
| @@ -629,10 +635,11 @@ out: | |||
| 629 | */ | 635 | */ |
| 630 | static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) | 636 | static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) |
| 631 | { | 637 | { |
| 632 | dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", | 638 | dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", |
| 633 | dentry->d_parent->d_name.name, dentry->d_name.name, | 639 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 634 | datasync); | 640 | datasync); |
| 635 | 641 | ||
| 642 | nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); | ||
| 636 | return 0; | 643 | return 0; |
| 637 | } | 644 | } |
| 638 | 645 | ||
| @@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) | |||
| 767 | struct nfs_fattr fattr; | 774 | struct nfs_fattr fattr; |
| 768 | 775 | ||
| 769 | parent = dget_parent(dentry); | 776 | parent = dget_parent(dentry); |
| 770 | lock_kernel(); | ||
| 771 | dir = parent->d_inode; | 777 | dir = parent->d_inode; |
| 772 | nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); | 778 | nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); |
| 773 | inode = dentry->d_inode; | 779 | inode = dentry->d_inode; |
| @@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) | |||
| 805 | 811 | ||
| 806 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); | 812 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); |
| 807 | out_valid: | 813 | out_valid: |
| 808 | unlock_kernel(); | ||
| 809 | dput(parent); | 814 | dput(parent); |
| 810 | dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", | 815 | dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", |
| 811 | __func__, dentry->d_parent->d_name.name, | 816 | __func__, dentry->d_parent->d_name.name, |
| @@ -824,7 +829,6 @@ out_zap_parent: | |||
| 824 | shrink_dcache_parent(dentry); | 829 | shrink_dcache_parent(dentry); |
| 825 | } | 830 | } |
| 826 | d_drop(dentry); | 831 | d_drop(dentry); |
| 827 | unlock_kernel(); | ||
| 828 | dput(parent); | 832 | dput(parent); |
| 829 | dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", | 833 | dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", |
| 830 | __func__, dentry->d_parent->d_name.name, | 834 | __func__, dentry->d_parent->d_name.name, |
| @@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry) | |||
| 858 | 862 | ||
| 859 | } | 863 | } |
| 860 | 864 | ||
| 865 | static void nfs_drop_nlink(struct inode *inode) | ||
| 866 | { | ||
| 867 | spin_lock(&inode->i_lock); | ||
| 868 | if (inode->i_nlink > 0) | ||
| 869 | drop_nlink(inode); | ||
| 870 | spin_unlock(&inode->i_lock); | ||
| 871 | } | ||
| 872 | |||
| 861 | /* | 873 | /* |
| 862 | * Called when the dentry loses inode. | 874 | * Called when the dentry loses inode. |
| 863 | * We use it to clean up silly-renamed files. | 875 | * We use it to clean up silly-renamed files. |
| @@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) | |||
| 869 | NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; | 881 | NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; |
| 870 | 882 | ||
| 871 | if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { | 883 | if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { |
| 872 | lock_kernel(); | ||
| 873 | drop_nlink(inode); | 884 | drop_nlink(inode); |
| 874 | nfs_complete_unlink(dentry, inode); | 885 | nfs_complete_unlink(dentry, inode); |
| 875 | unlock_kernel(); | ||
| 876 | } | 886 | } |
| 877 | iput(inode); | 887 | iput(inode); |
| 878 | } | 888 | } |
| @@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru | |||
| 903 | res = ERR_PTR(-ENOMEM); | 913 | res = ERR_PTR(-ENOMEM); |
| 904 | dentry->d_op = NFS_PROTO(dir)->dentry_ops; | 914 | dentry->d_op = NFS_PROTO(dir)->dentry_ops; |
| 905 | 915 | ||
| 906 | lock_kernel(); | ||
| 907 | |||
| 908 | /* | 916 | /* |
| 909 | * If we're doing an exclusive create, optimize away the lookup | 917 | * If we're doing an exclusive create, optimize away the lookup |
| 910 | * but don't hash the dentry. | 918 | * but don't hash the dentry. |
| @@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru | |||
| 912 | if (nfs_is_exclusive_create(dir, nd)) { | 920 | if (nfs_is_exclusive_create(dir, nd)) { |
| 913 | d_instantiate(dentry, NULL); | 921 | d_instantiate(dentry, NULL); |
| 914 | res = NULL; | 922 | res = NULL; |
| 915 | goto out_unlock; | 923 | goto out; |
| 916 | } | 924 | } |
| 917 | 925 | ||
| 918 | parent = dentry->d_parent; | 926 | parent = dentry->d_parent; |
| @@ -940,8 +948,6 @@ no_entry: | |||
| 940 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); | 948 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); |
| 941 | out_unblock_sillyrename: | 949 | out_unblock_sillyrename: |
| 942 | nfs_unblock_sillyrename(parent); | 950 | nfs_unblock_sillyrename(parent); |
| 943 | out_unlock: | ||
| 944 | unlock_kernel(); | ||
| 945 | out: | 951 | out: |
| 946 | return res; | 952 | return res; |
| 947 | } | 953 | } |
| @@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry | |||
| 999 | } | 1005 | } |
| 1000 | 1006 | ||
| 1001 | /* Open the file on the server */ | 1007 | /* Open the file on the server */ |
| 1002 | lock_kernel(); | ||
| 1003 | res = nfs4_atomic_open(dir, dentry, nd); | 1008 | res = nfs4_atomic_open(dir, dentry, nd); |
| 1004 | unlock_kernel(); | ||
| 1005 | if (IS_ERR(res)) { | 1009 | if (IS_ERR(res)) { |
| 1006 | error = PTR_ERR(res); | 1010 | error = PTR_ERR(res); |
| 1007 | switch (error) { | 1011 | switch (error) { |
| @@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) | |||
| 1063 | * operations that change the directory. We therefore save the | 1067 | * operations that change the directory. We therefore save the |
| 1064 | * change attribute *before* we do the RPC call. | 1068 | * change attribute *before* we do the RPC call. |
| 1065 | */ | 1069 | */ |
| 1066 | lock_kernel(); | ||
| 1067 | ret = nfs4_open_revalidate(dir, dentry, openflags, nd); | 1070 | ret = nfs4_open_revalidate(dir, dentry, openflags, nd); |
| 1068 | unlock_kernel(); | ||
| 1069 | out: | 1071 | out: |
| 1070 | dput(parent); | 1072 | dput(parent); |
| 1071 | if (!ret) | 1073 | if (!ret) |
| @@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, | |||
| 1218 | if ((nd->flags & LOOKUP_CREATE) != 0) | 1220 | if ((nd->flags & LOOKUP_CREATE) != 0) |
| 1219 | open_flags = nd->intent.open.flags; | 1221 | open_flags = nd->intent.open.flags; |
| 1220 | 1222 | ||
| 1221 | lock_kernel(); | ||
| 1222 | error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); | 1223 | error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); |
| 1223 | if (error != 0) | 1224 | if (error != 0) |
| 1224 | goto out_err; | 1225 | goto out_err; |
| 1225 | unlock_kernel(); | ||
| 1226 | return 0; | 1226 | return 0; |
| 1227 | out_err: | 1227 | out_err: |
| 1228 | unlock_kernel(); | ||
| 1229 | d_drop(dentry); | 1228 | d_drop(dentry); |
| 1230 | return error; | 1229 | return error; |
| 1231 | } | 1230 | } |
| @@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) | |||
| 1248 | attr.ia_mode = mode; | 1247 | attr.ia_mode = mode; |
| 1249 | attr.ia_valid = ATTR_MODE; | 1248 | attr.ia_valid = ATTR_MODE; |
| 1250 | 1249 | ||
| 1251 | lock_kernel(); | ||
| 1252 | status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); | 1250 | status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); |
| 1253 | if (status != 0) | 1251 | if (status != 0) |
| 1254 | goto out_err; | 1252 | goto out_err; |
| 1255 | unlock_kernel(); | ||
| 1256 | return 0; | 1253 | return 0; |
| 1257 | out_err: | 1254 | out_err: |
| 1258 | unlock_kernel(); | ||
| 1259 | d_drop(dentry); | 1255 | d_drop(dentry); |
| 1260 | return status; | 1256 | return status; |
| 1261 | } | 1257 | } |
| @@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1274 | attr.ia_valid = ATTR_MODE; | 1270 | attr.ia_valid = ATTR_MODE; |
| 1275 | attr.ia_mode = mode | S_IFDIR; | 1271 | attr.ia_mode = mode | S_IFDIR; |
| 1276 | 1272 | ||
| 1277 | lock_kernel(); | ||
| 1278 | error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); | 1273 | error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); |
| 1279 | if (error != 0) | 1274 | if (error != 0) |
| 1280 | goto out_err; | 1275 | goto out_err; |
| 1281 | unlock_kernel(); | ||
| 1282 | return 0; | 1276 | return 0; |
| 1283 | out_err: | 1277 | out_err: |
| 1284 | d_drop(dentry); | 1278 | d_drop(dentry); |
| 1285 | unlock_kernel(); | ||
| 1286 | return error; | 1279 | return error; |
| 1287 | } | 1280 | } |
| 1288 | 1281 | ||
| @@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 1299 | dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", | 1292 | dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", |
| 1300 | dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); | 1293 | dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); |
| 1301 | 1294 | ||
| 1302 | lock_kernel(); | ||
| 1303 | error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); | 1295 | error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); |
| 1304 | /* Ensure the VFS deletes this inode */ | 1296 | /* Ensure the VFS deletes this inode */ |
| 1305 | if (error == 0 && dentry->d_inode != NULL) | 1297 | if (error == 0 && dentry->d_inode != NULL) |
| 1306 | clear_nlink(dentry->d_inode); | 1298 | clear_nlink(dentry->d_inode); |
| 1307 | else if (error == -ENOENT) | 1299 | else if (error == -ENOENT) |
| 1308 | nfs_dentry_handle_enoent(dentry); | 1300 | nfs_dentry_handle_enoent(dentry); |
| 1309 | unlock_kernel(); | ||
| 1310 | 1301 | ||
| 1311 | return error; | 1302 | return error; |
| 1312 | } | 1303 | } |
| @@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry) | |||
| 1408 | error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); | 1399 | error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); |
| 1409 | /* The VFS may want to delete this inode */ | 1400 | /* The VFS may want to delete this inode */ |
| 1410 | if (error == 0) | 1401 | if (error == 0) |
| 1411 | drop_nlink(inode); | 1402 | nfs_drop_nlink(inode); |
| 1412 | nfs_mark_for_revalidate(inode); | 1403 | nfs_mark_for_revalidate(inode); |
| 1413 | } else | 1404 | } else |
| 1414 | error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); | 1405 | error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); |
| @@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 1431 | dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, | 1422 | dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, |
| 1432 | dir->i_ino, dentry->d_name.name); | 1423 | dir->i_ino, dentry->d_name.name); |
| 1433 | 1424 | ||
| 1434 | lock_kernel(); | ||
| 1435 | spin_lock(&dcache_lock); | 1425 | spin_lock(&dcache_lock); |
| 1436 | spin_lock(&dentry->d_lock); | 1426 | spin_lock(&dentry->d_lock); |
| 1437 | if (atomic_read(&dentry->d_count) > 1) { | 1427 | if (atomic_read(&dentry->d_count) > 1) { |
| @@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 1440 | /* Start asynchronous writeout of the inode */ | 1430 | /* Start asynchronous writeout of the inode */ |
| 1441 | write_inode_now(dentry->d_inode, 0); | 1431 | write_inode_now(dentry->d_inode, 0); |
| 1442 | error = nfs_sillyrename(dir, dentry); | 1432 | error = nfs_sillyrename(dir, dentry); |
| 1443 | unlock_kernel(); | ||
| 1444 | return error; | 1433 | return error; |
| 1445 | } | 1434 | } |
| 1446 | if (!d_unhashed(dentry)) { | 1435 | if (!d_unhashed(dentry)) { |
| @@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 1454 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); | 1443 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); |
| 1455 | } else if (need_rehash) | 1444 | } else if (need_rehash) |
| 1456 | d_rehash(dentry); | 1445 | d_rehash(dentry); |
| 1457 | unlock_kernel(); | ||
| 1458 | return error; | 1446 | return error; |
| 1459 | } | 1447 | } |
| 1460 | 1448 | ||
| @@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym | |||
| 1491 | attr.ia_mode = S_IFLNK | S_IRWXUGO; | 1479 | attr.ia_mode = S_IFLNK | S_IRWXUGO; |
| 1492 | attr.ia_valid = ATTR_MODE; | 1480 | attr.ia_valid = ATTR_MODE; |
| 1493 | 1481 | ||
| 1494 | lock_kernel(); | ||
| 1495 | |||
| 1496 | page = alloc_page(GFP_HIGHUSER); | 1482 | page = alloc_page(GFP_HIGHUSER); |
| 1497 | if (!page) { | 1483 | if (!page) |
| 1498 | unlock_kernel(); | ||
| 1499 | return -ENOMEM; | 1484 | return -ENOMEM; |
| 1500 | } | ||
| 1501 | 1485 | ||
| 1502 | kaddr = kmap_atomic(page, KM_USER0); | 1486 | kaddr = kmap_atomic(page, KM_USER0); |
| 1503 | memcpy(kaddr, symname, pathlen); | 1487 | memcpy(kaddr, symname, pathlen); |
| @@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym | |||
| 1512 | dentry->d_name.name, symname, error); | 1496 | dentry->d_name.name, symname, error); |
| 1513 | d_drop(dentry); | 1497 | d_drop(dentry); |
| 1514 | __free_page(page); | 1498 | __free_page(page); |
| 1515 | unlock_kernel(); | ||
| 1516 | return error; | 1499 | return error; |
| 1517 | } | 1500 | } |
| 1518 | 1501 | ||
| @@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym | |||
| 1530 | } else | 1513 | } else |
| 1531 | __free_page(page); | 1514 | __free_page(page); |
| 1532 | 1515 | ||
| 1533 | unlock_kernel(); | ||
| 1534 | return 0; | 1516 | return 0; |
| 1535 | } | 1517 | } |
| 1536 | 1518 | ||
| @@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) | |||
| 1544 | old_dentry->d_parent->d_name.name, old_dentry->d_name.name, | 1526 | old_dentry->d_parent->d_name.name, old_dentry->d_name.name, |
| 1545 | dentry->d_parent->d_name.name, dentry->d_name.name); | 1527 | dentry->d_parent->d_name.name, dentry->d_name.name); |
| 1546 | 1528 | ||
| 1547 | lock_kernel(); | ||
| 1548 | d_drop(dentry); | 1529 | d_drop(dentry); |
| 1549 | error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); | 1530 | error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); |
| 1550 | if (error == 0) { | 1531 | if (error == 0) { |
| 1551 | atomic_inc(&inode->i_count); | 1532 | atomic_inc(&inode->i_count); |
| 1552 | d_add(dentry, inode); | 1533 | d_add(dentry, inode); |
| 1553 | } | 1534 | } |
| 1554 | unlock_kernel(); | ||
| 1555 | return error; | 1535 | return error; |
| 1556 | } | 1536 | } |
| 1557 | 1537 | ||
| @@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 1591 | * To prevent any new references to the target during the rename, | 1571 | * To prevent any new references to the target during the rename, |
| 1592 | * we unhash the dentry and free the inode in advance. | 1572 | * we unhash the dentry and free the inode in advance. |
| 1593 | */ | 1573 | */ |
| 1594 | lock_kernel(); | ||
| 1595 | if (!d_unhashed(new_dentry)) { | 1574 | if (!d_unhashed(new_dentry)) { |
| 1596 | d_drop(new_dentry); | 1575 | d_drop(new_dentry); |
| 1597 | rehash = new_dentry; | 1576 | rehash = new_dentry; |
| @@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 1635 | /* dentry still busy? */ | 1614 | /* dentry still busy? */ |
| 1636 | goto out; | 1615 | goto out; |
| 1637 | } else | 1616 | } else |
| 1638 | drop_nlink(new_inode); | 1617 | nfs_drop_nlink(new_inode); |
| 1639 | 1618 | ||
| 1640 | go_ahead: | 1619 | go_ahead: |
| 1641 | /* | 1620 | /* |
| @@ -1669,7 +1648,6 @@ out: | |||
| 1669 | /* new dentry created? */ | 1648 | /* new dentry created? */ |
| 1670 | if (dentry) | 1649 | if (dentry) |
| 1671 | dput(dentry); | 1650 | dput(dentry); |
| 1672 | unlock_kernel(); | ||
| 1673 | return error; | 1651 | return error; |
| 1674 | } | 1652 | } |
| 1675 | 1653 | ||
| @@ -1962,8 +1940,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 1962 | } | 1940 | } |
| 1963 | 1941 | ||
| 1964 | force_lookup: | 1942 | force_lookup: |
| 1965 | lock_kernel(); | ||
| 1966 | |||
| 1967 | if (!NFS_PROTO(inode)->access) | 1943 | if (!NFS_PROTO(inode)->access) |
| 1968 | goto out_notsup; | 1944 | goto out_notsup; |
| 1969 | 1945 | ||
| @@ -1973,7 +1949,6 @@ force_lookup: | |||
| 1973 | put_rpccred(cred); | 1949 | put_rpccred(cred); |
| 1974 | } else | 1950 | } else |
| 1975 | res = PTR_ERR(cred); | 1951 | res = PTR_ERR(cred); |
| 1976 | unlock_kernel(); | ||
| 1977 | out: | 1952 | out: |
| 1978 | dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", | 1953 | dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", |
| 1979 | inode->i_sb->s_id, inode->i_ino, mask, res); | 1954 | inode->i_sb->s_id, inode->i_ino, mask, res); |
| @@ -1982,7 +1957,6 @@ out_notsup: | |||
| 1982 | res = nfs_revalidate_inode(NFS_SERVER(inode), inode); | 1957 | res = nfs_revalidate_inode(NFS_SERVER(inode), inode); |
| 1983 | if (res == 0) | 1958 | if (res == 0) |
| 1984 | res = generic_permission(inode, mask, NULL); | 1959 | res = generic_permission(inode, mask, NULL); |
| 1985 | unlock_kernel(); | ||
| 1986 | goto out; | 1960 | goto out; |
| 1987 | } | 1961 | } |
| 1988 | 1962 | ||
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4757a2b326a1..08f6b040d289 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c | |||
| @@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 890 | count = iov_length(iov, nr_segs); | 890 | count = iov_length(iov, nr_segs); |
| 891 | nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); | 891 | nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); |
| 892 | 892 | ||
| 893 | dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", | 893 | dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", |
| 894 | file->f_path.dentry->d_parent->d_name.name, | 894 | file->f_path.dentry->d_parent->d_name.name, |
| 895 | file->f_path.dentry->d_name.name, | 895 | file->f_path.dentry->d_name.name, |
| 896 | count, (long long) pos); | 896 | count, (long long) pos); |
| @@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 947 | count = iov_length(iov, nr_segs); | 947 | count = iov_length(iov, nr_segs); |
| 948 | nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); | 948 | nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); |
| 949 | 949 | ||
| 950 | dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", | 950 | dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", |
| 951 | file->f_path.dentry->d_parent->d_name.name, | 951 | file->f_path.dentry->d_parent->d_name.name, |
| 952 | file->f_path.dentry->d_name.name, | 952 | file->f_path.dentry->d_name.name, |
| 953 | count, (long long) pos); | 953 | count, (long long) pos); |
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d84a3d8f32af..78460657f5cb 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
| @@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, | |||
| 50 | static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, | 50 | static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, |
| 51 | unsigned long nr_segs, loff_t pos); | 51 | unsigned long nr_segs, loff_t pos); |
| 52 | static int nfs_file_flush(struct file *, fl_owner_t id); | 52 | static int nfs_file_flush(struct file *, fl_owner_t id); |
| 53 | static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); | 53 | static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); |
| 54 | static int nfs_check_flags(int flags); | 54 | static int nfs_check_flags(int flags); |
| 55 | static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); | 55 | static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); |
| 56 | static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); | 56 | static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); |
| @@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = { | |||
| 72 | .open = nfs_file_open, | 72 | .open = nfs_file_open, |
| 73 | .flush = nfs_file_flush, | 73 | .flush = nfs_file_flush, |
| 74 | .release = nfs_file_release, | 74 | .release = nfs_file_release, |
| 75 | .fsync = nfs_fsync, | 75 | .fsync = nfs_file_fsync, |
| 76 | .lock = nfs_lock, | 76 | .lock = nfs_lock, |
| 77 | .flock = nfs_flock, | 77 | .flock = nfs_flock, |
| 78 | .splice_read = nfs_file_splice_read, | 78 | .splice_read = nfs_file_splice_read, |
| @@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp) | |||
| 119 | { | 119 | { |
| 120 | int res; | 120 | int res; |
| 121 | 121 | ||
| 122 | dprintk("NFS: open file(%s/%s)\n", | ||
| 123 | filp->f_path.dentry->d_parent->d_name.name, | ||
| 124 | filp->f_path.dentry->d_name.name); | ||
| 125 | |||
| 122 | res = nfs_check_flags(filp->f_flags); | 126 | res = nfs_check_flags(filp->f_flags); |
| 123 | if (res) | 127 | if (res) |
| 124 | return res; | 128 | return res; |
| 125 | 129 | ||
| 126 | nfs_inc_stats(inode, NFSIOS_VFSOPEN); | 130 | nfs_inc_stats(inode, NFSIOS_VFSOPEN); |
| 127 | lock_kernel(); | 131 | res = nfs_open(inode, filp); |
| 128 | res = NFS_PROTO(inode)->file_open(inode, filp); | ||
| 129 | unlock_kernel(); | ||
| 130 | return res; | 132 | return res; |
| 131 | } | 133 | } |
| 132 | 134 | ||
| 133 | static int | 135 | static int |
| 134 | nfs_file_release(struct inode *inode, struct file *filp) | 136 | nfs_file_release(struct inode *inode, struct file *filp) |
| 135 | { | 137 | { |
| 138 | struct dentry *dentry = filp->f_path.dentry; | ||
| 139 | |||
| 140 | dprintk("NFS: release(%s/%s)\n", | ||
| 141 | dentry->d_parent->d_name.name, | ||
| 142 | dentry->d_name.name); | ||
| 143 | |||
| 136 | /* Ensure that dirty pages are flushed out with the right creds */ | 144 | /* Ensure that dirty pages are flushed out with the right creds */ |
| 137 | if (filp->f_mode & FMODE_WRITE) | 145 | if (filp->f_mode & FMODE_WRITE) |
| 138 | nfs_wb_all(filp->f_path.dentry->d_inode); | 146 | nfs_wb_all(dentry->d_inode); |
| 139 | nfs_inc_stats(inode, NFSIOS_VFSRELEASE); | 147 | nfs_inc_stats(inode, NFSIOS_VFSRELEASE); |
| 140 | return NFS_PROTO(inode)->file_release(inode, filp); | 148 | return nfs_release(inode, filp); |
| 141 | } | 149 | } |
| 142 | 150 | ||
| 143 | /** | 151 | /** |
| @@ -170,6 +178,13 @@ force_reval: | |||
| 170 | 178 | ||
| 171 | static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) | 179 | static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) |
| 172 | { | 180 | { |
| 181 | loff_t loff; | ||
| 182 | |||
| 183 | dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", | ||
| 184 | filp->f_path.dentry->d_parent->d_name.name, | ||
| 185 | filp->f_path.dentry->d_name.name, | ||
| 186 | offset, origin); | ||
| 187 | |||
| 173 | /* origin == SEEK_END => we must revalidate the cached file length */ | 188 | /* origin == SEEK_END => we must revalidate the cached file length */ |
| 174 | if (origin == SEEK_END) { | 189 | if (origin == SEEK_END) { |
| 175 | struct inode *inode = filp->f_mapping->host; | 190 | struct inode *inode = filp->f_mapping->host; |
| @@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) | |||
| 177 | if (retval < 0) | 192 | if (retval < 0) |
| 178 | return (loff_t)retval; | 193 | return (loff_t)retval; |
| 179 | } | 194 | } |
| 180 | return remote_llseek(filp, offset, origin); | 195 | lock_kernel(); /* BKL needed? */ |
| 196 | loff = generic_file_llseek_unlocked(filp, offset, origin); | ||
| 197 | unlock_kernel(); | ||
| 198 | return loff; | ||
| 181 | } | 199 | } |
| 182 | 200 | ||
| 183 | /* | 201 | /* |
| 184 | * Helper for nfs_file_flush() and nfs_fsync() | 202 | * Helper for nfs_file_flush() and nfs_file_fsync() |
| 185 | * | 203 | * |
| 186 | * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to | 204 | * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to |
| 187 | * disk, but it retrieves and clears ctx->error after synching, despite | 205 | * disk, but it retrieves and clears ctx->error after synching, despite |
| @@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) | |||
| 207 | 225 | ||
| 208 | /* | 226 | /* |
| 209 | * Flush all dirty pages, and check for write errors. | 227 | * Flush all dirty pages, and check for write errors. |
| 210 | * | ||
| 211 | */ | 228 | */ |
| 212 | static int | 229 | static int |
| 213 | nfs_file_flush(struct file *file, fl_owner_t id) | 230 | nfs_file_flush(struct file *file, fl_owner_t id) |
| 214 | { | 231 | { |
| 215 | struct nfs_open_context *ctx = nfs_file_open_context(file); | 232 | struct nfs_open_context *ctx = nfs_file_open_context(file); |
| 216 | struct inode *inode = file->f_path.dentry->d_inode; | 233 | struct dentry *dentry = file->f_path.dentry; |
| 234 | struct inode *inode = dentry->d_inode; | ||
| 217 | int status; | 235 | int status; |
| 218 | 236 | ||
| 219 | dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); | 237 | dprintk("NFS: flush(%s/%s)\n", |
| 238 | dentry->d_parent->d_name.name, | ||
| 239 | dentry->d_name.name); | ||
| 220 | 240 | ||
| 221 | if ((file->f_mode & FMODE_WRITE) == 0) | 241 | if ((file->f_mode & FMODE_WRITE) == 0) |
| 222 | return 0; | 242 | return 0; |
| @@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 241 | if (iocb->ki_filp->f_flags & O_DIRECT) | 261 | if (iocb->ki_filp->f_flags & O_DIRECT) |
| 242 | return nfs_file_direct_read(iocb, iov, nr_segs, pos); | 262 | return nfs_file_direct_read(iocb, iov, nr_segs, pos); |
| 243 | 263 | ||
| 244 | dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", | 264 | dprintk("NFS: read(%s/%s, %lu@%lu)\n", |
| 245 | dentry->d_parent->d_name.name, dentry->d_name.name, | 265 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 246 | (unsigned long) count, (unsigned long) pos); | 266 | (unsigned long) count, (unsigned long) pos); |
| 247 | 267 | ||
| @@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, | |||
| 261 | struct inode *inode = dentry->d_inode; | 281 | struct inode *inode = dentry->d_inode; |
| 262 | ssize_t res; | 282 | ssize_t res; |
| 263 | 283 | ||
| 264 | dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", | 284 | dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", |
| 265 | dentry->d_parent->d_name.name, dentry->d_name.name, | 285 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 266 | (unsigned long) count, (unsigned long long) *ppos); | 286 | (unsigned long) count, (unsigned long long) *ppos); |
| 267 | 287 | ||
| @@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
| 278 | struct inode *inode = dentry->d_inode; | 298 | struct inode *inode = dentry->d_inode; |
| 279 | int status; | 299 | int status; |
| 280 | 300 | ||
| 281 | dfprintk(VFS, "nfs: mmap(%s/%s)\n", | 301 | dprintk("NFS: mmap(%s/%s)\n", |
| 282 | dentry->d_parent->d_name.name, dentry->d_name.name); | 302 | dentry->d_parent->d_name.name, dentry->d_name.name); |
| 283 | 303 | ||
| 284 | status = nfs_revalidate_mapping(inode, file->f_mapping); | 304 | status = nfs_revalidate_mapping(inode, file->f_mapping); |
| @@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
| 296 | * whether any write errors occurred for this process. | 316 | * whether any write errors occurred for this process. |
| 297 | */ | 317 | */ |
| 298 | static int | 318 | static int |
| 299 | nfs_fsync(struct file *file, struct dentry *dentry, int datasync) | 319 | nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) |
| 300 | { | 320 | { |
| 301 | struct nfs_open_context *ctx = nfs_file_open_context(file); | 321 | struct nfs_open_context *ctx = nfs_file_open_context(file); |
| 302 | struct inode *inode = dentry->d_inode; | 322 | struct inode *inode = dentry->d_inode; |
| 303 | 323 | ||
| 304 | dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); | 324 | dprintk("NFS: fsync file(%s/%s) datasync %d\n", |
| 325 | dentry->d_parent->d_name.name, dentry->d_name.name, | ||
| 326 | datasync); | ||
| 305 | 327 | ||
| 306 | nfs_inc_stats(inode, NFSIOS_VFSFSYNC); | 328 | nfs_inc_stats(inode, NFSIOS_VFSFSYNC); |
| 307 | return nfs_do_fsync(ctx, inode); | 329 | return nfs_do_fsync(ctx, inode); |
| @@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, | |||
| 324 | struct page *page; | 346 | struct page *page; |
| 325 | index = pos >> PAGE_CACHE_SHIFT; | 347 | index = pos >> PAGE_CACHE_SHIFT; |
| 326 | 348 | ||
| 349 | dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", | ||
| 350 | file->f_path.dentry->d_parent->d_name.name, | ||
| 351 | file->f_path.dentry->d_name.name, | ||
| 352 | mapping->host->i_ino, len, (long long) pos); | ||
| 353 | |||
| 327 | page = __grab_cache_page(mapping, index); | 354 | page = __grab_cache_page(mapping, index); |
| 328 | if (!page) | 355 | if (!page) |
| 329 | return -ENOMEM; | 356 | return -ENOMEM; |
| @@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, | |||
| 344 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | 371 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); |
| 345 | int status; | 372 | int status; |
| 346 | 373 | ||
| 347 | lock_kernel(); | 374 | dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", |
| 375 | file->f_path.dentry->d_parent->d_name.name, | ||
| 376 | file->f_path.dentry->d_name.name, | ||
| 377 | mapping->host->i_ino, len, (long long) pos); | ||
| 378 | |||
| 379 | /* | ||
| 380 | * Zero any uninitialised parts of the page, and then mark the page | ||
| 381 | * as up to date if it turns out that we're extending the file. | ||
| 382 | */ | ||
| 383 | if (!PageUptodate(page)) { | ||
| 384 | unsigned pglen = nfs_page_length(page); | ||
| 385 | unsigned end = offset + len; | ||
| 386 | |||
| 387 | if (pglen == 0) { | ||
| 388 | zero_user_segments(page, 0, offset, | ||
| 389 | end, PAGE_CACHE_SIZE); | ||
| 390 | SetPageUptodate(page); | ||
| 391 | } else if (end >= pglen) { | ||
| 392 | zero_user_segment(page, end, PAGE_CACHE_SIZE); | ||
| 393 | if (offset == 0) | ||
| 394 | SetPageUptodate(page); | ||
| 395 | } else | ||
| 396 | zero_user_segment(page, pglen, PAGE_CACHE_SIZE); | ||
| 397 | } | ||
| 398 | |||
| 348 | status = nfs_updatepage(file, page, offset, copied); | 399 | status = nfs_updatepage(file, page, offset, copied); |
| 349 | unlock_kernel(); | ||
| 350 | 400 | ||
| 351 | unlock_page(page); | 401 | unlock_page(page); |
| 352 | page_cache_release(page); | 402 | page_cache_release(page); |
| @@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, | |||
| 358 | 408 | ||
| 359 | static void nfs_invalidate_page(struct page *page, unsigned long offset) | 409 | static void nfs_invalidate_page(struct page *page, unsigned long offset) |
| 360 | { | 410 | { |
| 411 | dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); | ||
| 412 | |||
| 361 | if (offset != 0) | 413 | if (offset != 0) |
| 362 | return; | 414 | return; |
| 363 | /* Cancel any unstarted writes on this page */ | 415 | /* Cancel any unstarted writes on this page */ |
| @@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) | |||
| 366 | 418 | ||
| 367 | static int nfs_release_page(struct page *page, gfp_t gfp) | 419 | static int nfs_release_page(struct page *page, gfp_t gfp) |
| 368 | { | 420 | { |
| 421 | dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); | ||
| 422 | |||
| 369 | /* If PagePrivate() is set, then the page is not freeable */ | 423 | /* If PagePrivate() is set, then the page is not freeable */ |
| 370 | return 0; | 424 | return 0; |
| 371 | } | 425 | } |
| 372 | 426 | ||
| 373 | static int nfs_launder_page(struct page *page) | 427 | static int nfs_launder_page(struct page *page) |
| 374 | { | 428 | { |
| 375 | return nfs_wb_page(page->mapping->host, page); | 429 | struct inode *inode = page->mapping->host; |
| 430 | |||
| 431 | dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", | ||
| 432 | inode->i_ino, (long long)page_offset(page)); | ||
| 433 | |||
| 434 | return nfs_wb_page(inode, page); | ||
| 376 | } | 435 | } |
| 377 | 436 | ||
| 378 | const struct address_space_operations nfs_file_aops = { | 437 | const struct address_space_operations nfs_file_aops = { |
| @@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = { | |||
| 392 | static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) | 451 | static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) |
| 393 | { | 452 | { |
| 394 | struct file *filp = vma->vm_file; | 453 | struct file *filp = vma->vm_file; |
| 454 | struct dentry *dentry = filp->f_path.dentry; | ||
| 395 | unsigned pagelen; | 455 | unsigned pagelen; |
| 396 | int ret = -EINVAL; | 456 | int ret = -EINVAL; |
| 397 | struct address_space *mapping; | 457 | struct address_space *mapping; |
| 398 | 458 | ||
| 459 | dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", | ||
| 460 | dentry->d_parent->d_name.name, dentry->d_name.name, | ||
| 461 | filp->f_mapping->host->i_ino, | ||
| 462 | (long long)page_offset(page)); | ||
| 463 | |||
| 399 | lock_page(page); | 464 | lock_page(page); |
| 400 | mapping = page->mapping; | 465 | mapping = page->mapping; |
| 401 | if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) | 466 | if (mapping != dentry->d_inode->i_mapping) |
| 402 | goto out_unlock; | 467 | goto out_unlock; |
| 403 | 468 | ||
| 404 | ret = 0; | 469 | ret = 0; |
| @@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 446 | if (iocb->ki_filp->f_flags & O_DIRECT) | 511 | if (iocb->ki_filp->f_flags & O_DIRECT) |
| 447 | return nfs_file_direct_write(iocb, iov, nr_segs, pos); | 512 | return nfs_file_direct_write(iocb, iov, nr_segs, pos); |
| 448 | 513 | ||
| 449 | dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", | 514 | dprintk("NFS: write(%s/%s, %lu@%Ld)\n", |
| 450 | dentry->d_parent->d_name.name, dentry->d_name.name, | 515 | dentry->d_parent->d_name.name, dentry->d_name.name, |
| 451 | inode->i_ino, (unsigned long) count, (long long) pos); | 516 | (unsigned long) count, (long long) pos); |
| 452 | 517 | ||
| 453 | result = -EBUSY; | 518 | result = -EBUSY; |
| 454 | if (IS_SWAPFILE(inode)) | 519 | if (IS_SWAPFILE(inode)) |
| @@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) | |||
| 582 | * This makes locking act as a cache coherency point. | 647 | * This makes locking act as a cache coherency point. |
| 583 | */ | 648 | */ |
| 584 | nfs_sync_mapping(filp->f_mapping); | 649 | nfs_sync_mapping(filp->f_mapping); |
| 585 | nfs_zap_caches(inode); | 650 | if (!nfs_have_delegation(inode, FMODE_READ)) |
| 651 | nfs_zap_caches(inode); | ||
| 586 | out: | 652 | out: |
| 587 | return status; | 653 | return status; |
| 588 | } | 654 | } |
| @@ -592,23 +658,35 @@ out: | |||
| 592 | */ | 658 | */ |
| 593 | static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) | 659 | static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) |
| 594 | { | 660 | { |
| 595 | struct inode * inode = filp->f_mapping->host; | 661 | struct inode *inode = filp->f_mapping->host; |
| 662 | int ret = -ENOLCK; | ||
| 596 | 663 | ||
| 597 | dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", | 664 | dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", |
| 598 | inode->i_sb->s_id, inode->i_ino, | 665 | filp->f_path.dentry->d_parent->d_name.name, |
| 666 | filp->f_path.dentry->d_name.name, | ||
| 599 | fl->fl_type, fl->fl_flags, | 667 | fl->fl_type, fl->fl_flags, |
| 600 | (long long)fl->fl_start, (long long)fl->fl_end); | 668 | (long long)fl->fl_start, (long long)fl->fl_end); |
| 669 | |||
| 601 | nfs_inc_stats(inode, NFSIOS_VFSLOCK); | 670 | nfs_inc_stats(inode, NFSIOS_VFSLOCK); |
| 602 | 671 | ||
| 603 | /* No mandatory locks over NFS */ | 672 | /* No mandatory locks over NFS */ |
| 604 | if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) | 673 | if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) |
| 605 | return -ENOLCK; | 674 | goto out_err; |
| 675 | |||
| 676 | if (NFS_PROTO(inode)->lock_check_bounds != NULL) { | ||
| 677 | ret = NFS_PROTO(inode)->lock_check_bounds(fl); | ||
| 678 | if (ret < 0) | ||
| 679 | goto out_err; | ||
| 680 | } | ||
| 606 | 681 | ||
| 607 | if (IS_GETLK(cmd)) | 682 | if (IS_GETLK(cmd)) |
| 608 | return do_getlk(filp, cmd, fl); | 683 | ret = do_getlk(filp, cmd, fl); |
| 609 | if (fl->fl_type == F_UNLCK) | 684 | else if (fl->fl_type == F_UNLCK) |
| 610 | return do_unlk(filp, cmd, fl); | 685 | ret = do_unlk(filp, cmd, fl); |
| 611 | return do_setlk(filp, cmd, fl); | 686 | else |
| 687 | ret = do_setlk(filp, cmd, fl); | ||
| 688 | out_err: | ||
| 689 | return ret; | ||
| 612 | } | 690 | } |
| 613 | 691 | ||
| 614 | /* | 692 | /* |
| @@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) | |||
| 616 | */ | 694 | */ |
| 617 | static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) | 695 | static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) |
| 618 | { | 696 | { |
| 619 | dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", | 697 | dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", |
| 620 | filp->f_path.dentry->d_inode->i_sb->s_id, | 698 | filp->f_path.dentry->d_parent->d_name.name, |
| 621 | filp->f_path.dentry->d_inode->i_ino, | 699 | filp->f_path.dentry->d_name.name, |
| 622 | fl->fl_type, fl->fl_flags); | 700 | fl->fl_type, fl->fl_flags); |
| 623 | 701 | ||
| 624 | /* | 702 | /* |
| @@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) | |||
| 641 | return do_setlk(filp, cmd, fl); | 719 | return do_setlk(filp, cmd, fl); |
| 642 | } | 720 | } |
| 643 | 721 | ||
| 722 | /* | ||
| 723 | * There is no protocol support for leases, so we have no way to implement | ||
| 724 | * them correctly in the face of opens by other clients. | ||
| 725 | */ | ||
| 644 | static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) | 726 | static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) |
| 645 | { | 727 | { |
| 646 | /* | 728 | dprintk("NFS: setlease(%s/%s, arg=%ld)\n", |
| 647 | * There is no protocol support for leases, so we have no way | 729 | file->f_path.dentry->d_parent->d_name.name, |
| 648 | * to implement them correctly in the face of opens by other | 730 | file->f_path.dentry->d_name.name, arg); |
| 649 | * clients. | 731 | |
| 650 | */ | ||
| 651 | return -EINVAL; | 732 | return -EINVAL; |
| 652 | } | 733 | } |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 596c5d8e86f4..df23f987da6b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
| @@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; | |||
| 57 | static void nfs_invalidate_inode(struct inode *); | 57 | static void nfs_invalidate_inode(struct inode *); |
| 58 | static int nfs_update_inode(struct inode *, struct nfs_fattr *); | 58 | static int nfs_update_inode(struct inode *, struct nfs_fattr *); |
| 59 | 59 | ||
| 60 | static void nfs_zap_acl_cache(struct inode *); | ||
| 61 | |||
| 62 | static struct kmem_cache * nfs_inode_cachep; | 60 | static struct kmem_cache * nfs_inode_cachep; |
| 63 | 61 | ||
| 64 | static inline unsigned long | 62 | static inline unsigned long |
| @@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) | |||
| 167 | } | 165 | } |
| 168 | } | 166 | } |
| 169 | 167 | ||
| 170 | static void nfs_zap_acl_cache(struct inode *inode) | 168 | void nfs_zap_acl_cache(struct inode *inode) |
| 171 | { | 169 | { |
| 172 | void (*clear_acl_cache)(struct inode *); | 170 | void (*clear_acl_cache)(struct inode *); |
| 173 | 171 | ||
| @@ -347,7 +345,7 @@ out_no_inode: | |||
| 347 | goto out; | 345 | goto out; |
| 348 | } | 346 | } |
| 349 | 347 | ||
| 350 | #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) | 348 | #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) |
| 351 | 349 | ||
| 352 | int | 350 | int |
| 353 | nfs_setattr(struct dentry *dentry, struct iattr *attr) | 351 | nfs_setattr(struct dentry *dentry, struct iattr *attr) |
| @@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 369 | 367 | ||
| 370 | /* Optimization: if the end result is no change, don't RPC */ | 368 | /* Optimization: if the end result is no change, don't RPC */ |
| 371 | attr->ia_valid &= NFS_VALID_ATTRS; | 369 | attr->ia_valid &= NFS_VALID_ATTRS; |
| 372 | if (attr->ia_valid == 0) | 370 | if ((attr->ia_valid & ~ATTR_FILE) == 0) |
| 373 | return 0; | 371 | return 0; |
| 374 | 372 | ||
| 375 | lock_kernel(); | ||
| 376 | /* Write all dirty data */ | 373 | /* Write all dirty data */ |
| 377 | if (S_ISREG(inode->i_mode)) { | 374 | if (S_ISREG(inode->i_mode)) { |
| 378 | filemap_write_and_wait(inode->i_mapping); | 375 | filemap_write_and_wait(inode->i_mapping); |
| @@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 386 | error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); | 383 | error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); |
| 387 | if (error == 0) | 384 | if (error == 0) |
| 388 | nfs_refresh_inode(inode, &fattr); | 385 | nfs_refresh_inode(inode, &fattr); |
| 389 | unlock_kernel(); | ||
| 390 | return error; | 386 | return error; |
| 391 | } | 387 | } |
| 392 | 388 | ||
| 393 | /** | 389 | /** |
| 390 | * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall | ||
| 391 | * @inode: inode of the file used | ||
| 392 | * @offset: file offset to start truncating | ||
| 393 | * | ||
| 394 | * This is a copy of the common vmtruncate, but with the locking | ||
| 395 | * corrected to take into account the fact that NFS requires | ||
| 396 | * inode->i_size to be updated under the inode->i_lock. | ||
| 397 | */ | ||
| 398 | static int nfs_vmtruncate(struct inode * inode, loff_t offset) | ||
| 399 | { | ||
| 400 | if (i_size_read(inode) < offset) { | ||
| 401 | unsigned long limit; | ||
| 402 | |||
| 403 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
| 404 | if (limit != RLIM_INFINITY && offset > limit) | ||
| 405 | goto out_sig; | ||
| 406 | if (offset > inode->i_sb->s_maxbytes) | ||
| 407 | goto out_big; | ||
| 408 | spin_lock(&inode->i_lock); | ||
| 409 | i_size_write(inode, offset); | ||
| 410 | spin_unlock(&inode->i_lock); | ||
| 411 | } else { | ||
| 412 | struct address_space *mapping = inode->i_mapping; | ||
| 413 | |||
| 414 | /* | ||
| 415 | * truncation of in-use swapfiles is disallowed - it would | ||
| 416 | * cause subsequent swapout to scribble on the now-freed | ||
| 417 | * blocks. | ||
| 418 | */ | ||
| 419 | if (IS_SWAPFILE(inode)) | ||
| 420 | return -ETXTBSY; | ||
| 421 | spin_lock(&inode->i_lock); | ||
| 422 | i_size_write(inode, offset); | ||
| 423 | spin_unlock(&inode->i_lock); | ||
| 424 | |||
| 425 | /* | ||
| 426 | * unmap_mapping_range is called twice, first simply for | ||
| 427 | * efficiency so that truncate_inode_pages does fewer | ||
| 428 | * single-page unmaps. However after this first call, and | ||
| 429 | * before truncate_inode_pages finishes, it is possible for | ||
| 430 | * private pages to be COWed, which remain after | ||
| 431 | * truncate_inode_pages finishes, hence the second | ||
| 432 | * unmap_mapping_range call must be made for correctness. | ||
| 433 | */ | ||
| 434 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
| 435 | truncate_inode_pages(mapping, offset); | ||
| 436 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
| 437 | } | ||
| 438 | return 0; | ||
| 439 | out_sig: | ||
| 440 | send_sig(SIGXFSZ, current, 0); | ||
| 441 | out_big: | ||
| 442 | return -EFBIG; | ||
| 443 | } | ||
| 444 | |||
| 445 | /** | ||
| 394 | * nfs_setattr_update_inode - Update inode metadata after a setattr call. | 446 | * nfs_setattr_update_inode - Update inode metadata after a setattr call. |
| 395 | * @inode: pointer to struct inode | 447 | * @inode: pointer to struct inode |
| 396 | * @attr: pointer to struct iattr | 448 | * @attr: pointer to struct iattr |
| @@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) | |||
| 416 | } | 468 | } |
| 417 | if ((attr->ia_valid & ATTR_SIZE) != 0) { | 469 | if ((attr->ia_valid & ATTR_SIZE) != 0) { |
| 418 | nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); | 470 | nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); |
| 419 | inode->i_size = attr->ia_size; | 471 | nfs_vmtruncate(inode, attr->ia_size); |
| 420 | vmtruncate(inode, attr->ia_size); | ||
| 421 | } | 472 | } |
| 422 | } | 473 | } |
| 423 | 474 | ||
| @@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) | |||
| 647 | inode->i_sb->s_id, (long long)NFS_FILEID(inode)); | 698 | inode->i_sb->s_id, (long long)NFS_FILEID(inode)); |
| 648 | 699 | ||
| 649 | nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); | 700 | nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); |
| 650 | lock_kernel(); | ||
| 651 | if (is_bad_inode(inode)) | 701 | if (is_bad_inode(inode)) |
| 652 | goto out_nowait; | 702 | goto out_nowait; |
| 653 | if (NFS_STALE(inode)) | 703 | if (NFS_STALE(inode)) |
| @@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) | |||
| 696 | nfs_wake_up_inode(inode); | 746 | nfs_wake_up_inode(inode); |
| 697 | 747 | ||
| 698 | out_nowait: | 748 | out_nowait: |
| 699 | unlock_kernel(); | ||
| 700 | return status; | 749 | return status; |
| 701 | } | 750 | } |
| 702 | 751 | ||
| @@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
| 831 | if (S_ISDIR(inode->i_mode)) | 880 | if (S_ISDIR(inode->i_mode)) |
| 832 | nfsi->cache_validity |= NFS_INO_INVALID_DATA; | 881 | nfsi->cache_validity |= NFS_INO_INVALID_DATA; |
| 833 | } | 882 | } |
| 834 | if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && | 883 | if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && |
| 835 | nfsi->npages == 0) | 884 | nfsi->npages == 0) |
| 836 | inode->i_size = nfs_size_to_loff_t(fattr->size); | 885 | i_size_write(inode, nfs_size_to_loff_t(fattr->size)); |
| 837 | } | 886 | } |
| 838 | } | 887 | } |
| 839 | 888 | ||
| @@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa | |||
| 974 | (fattr->valid & NFS_ATTR_WCC) == 0) { | 1023 | (fattr->valid & NFS_ATTR_WCC) == 0) { |
| 975 | memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); | 1024 | memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); |
| 976 | memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); | 1025 | memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); |
| 977 | fattr->pre_size = inode->i_size; | 1026 | fattr->pre_size = i_size_read(inode); |
| 978 | fattr->valid |= NFS_ATTR_WCC; | 1027 | fattr->valid |= NFS_ATTR_WCC; |
| 979 | } | 1028 | } |
| 980 | return nfs_post_op_update_inode(inode, fattr); | 1029 | return nfs_post_op_update_inode(inode, fattr); |
| @@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
| 1059 | /* Do we perhaps have any outstanding writes, or has | 1108 | /* Do we perhaps have any outstanding writes, or has |
| 1060 | * the file grown beyond our last write? */ | 1109 | * the file grown beyond our last write? */ |
| 1061 | if (nfsi->npages == 0 || new_isize > cur_isize) { | 1110 | if (nfsi->npages == 0 || new_isize > cur_isize) { |
| 1062 | inode->i_size = new_isize; | 1111 | i_size_write(inode, new_isize); |
| 1063 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; | 1112 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; |
| 1064 | } | 1113 | } |
| 1065 | dprintk("NFS: isize change on server for file %s/%ld\n", | 1114 | dprintk("NFS: isize change on server for file %s/%ld\n", |
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 04ae867dddba..24241fcbb98d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
| @@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *); | |||
| 150 | #ifdef CONFIG_NFS_V4 | 150 | #ifdef CONFIG_NFS_V4 |
| 151 | extern void nfs4_clear_inode(struct inode *); | 151 | extern void nfs4_clear_inode(struct inode *); |
| 152 | #endif | 152 | #endif |
| 153 | void nfs_zap_acl_cache(struct inode *inode); | ||
| 153 | 154 | ||
| 154 | /* super.c */ | 155 | /* super.c */ |
| 155 | extern struct file_system_type nfs_xdev_fs_type; | 156 | extern struct file_system_type nfs_xdev_fs_type; |
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 6350ecbde589..a36952810032 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h | |||
| @@ -5,135 +5,41 @@ | |||
| 5 | * | 5 | * |
| 6 | * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> | 6 | * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> |
| 7 | * | 7 | * |
| 8 | * NFS client per-mount statistics provide information about the health of | ||
| 9 | * the NFS client and the health of each NFS mount point. Generally these | ||
| 10 | * are not for detailed problem diagnosis, but simply to indicate that there | ||
| 11 | * is a problem. | ||
| 12 | * | ||
| 13 | * These counters are not meant to be human-readable, but are meant to be | ||
| 14 | * integrated into system monitoring tools such as "sar" and "iostat". As | ||
| 15 | * such, the counters are sampled by the tools over time, and are never | ||
| 16 | * zeroed after a file system is mounted. Moving averages can be computed | ||
| 17 | * by the tools by taking the difference between two instantaneous samples | ||
| 18 | * and dividing that by the time between the samples. | ||
| 19 | */ | 8 | */ |
| 20 | 9 | ||
| 21 | #ifndef _NFS_IOSTAT | 10 | #ifndef _NFS_IOSTAT |
| 22 | #define _NFS_IOSTAT | 11 | #define _NFS_IOSTAT |
| 23 | 12 | ||
| 24 | #define NFS_IOSTAT_VERS "1.0" | ||
| 25 | |||
| 26 | /* | ||
| 27 | * NFS byte counters | ||
| 28 | * | ||
| 29 | * 1. SERVER - the number of payload bytes read from or written to the | ||
| 30 | * server by the NFS client via an NFS READ or WRITE request. | ||
| 31 | * | ||
| 32 | * 2. NORMAL - the number of bytes read or written by applications via | ||
| 33 | * the read(2) and write(2) system call interfaces. | ||
| 34 | * | ||
| 35 | * 3. DIRECT - the number of bytes read or written from files opened | ||
| 36 | * with the O_DIRECT flag. | ||
| 37 | * | ||
| 38 | * These counters give a view of the data throughput into and out of the NFS | ||
| 39 | * client. Comparing the number of bytes requested by an application with the | ||
| 40 | * number of bytes the client requests from the server can provide an | ||
| 41 | * indication of client efficiency (per-op, cache hits, etc). | ||
| 42 | * | ||
| 43 | * These counters can also help characterize which access methods are in | ||
| 44 | * use. DIRECT by itself shows whether there is any O_DIRECT traffic. | ||
| 45 | * NORMAL + DIRECT shows how much data is going through the system call | ||
| 46 | * interface. A large amount of SERVER traffic without much NORMAL or | ||
| 47 | * DIRECT traffic shows that applications are using mapped files. | ||
| 48 | * | ||
| 49 | * NFS page counters | ||
| 50 | * | ||
| 51 | * These count the number of pages read or written via nfs_readpage(), | ||
| 52 | * nfs_readpages(), or their write equivalents. | ||
| 53 | */ | ||
| 54 | enum nfs_stat_bytecounters { | ||
| 55 | NFSIOS_NORMALREADBYTES = 0, | ||
| 56 | NFSIOS_NORMALWRITTENBYTES, | ||
| 57 | NFSIOS_DIRECTREADBYTES, | ||
| 58 | NFSIOS_DIRECTWRITTENBYTES, | ||
| 59 | NFSIOS_SERVERREADBYTES, | ||
| 60 | NFSIOS_SERVERWRITTENBYTES, | ||
| 61 | NFSIOS_READPAGES, | ||
| 62 | NFSIOS_WRITEPAGES, | ||
| 63 | __NFSIOS_BYTESMAX, | ||
| 64 | }; | ||
| 65 | |||
| 66 | /* | ||
| 67 | * NFS event counters | ||
| 68 | * | ||
| 69 | * These counters provide a low-overhead way of monitoring client activity | ||
| 70 | * without enabling NFS trace debugging. The counters show the rate at | ||
| 71 | * which VFS requests are made, and how often the client invalidates its | ||
| 72 | * data and attribute caches. This allows system administrators to monitor | ||
| 73 | * such things as how close-to-open is working, and answer questions such | ||
| 74 | * as "why are there so many GETATTR requests on the wire?" | ||
| 75 | * | ||
| 76 | * They also count anamolous events such as short reads and writes, silly | ||
| 77 | * renames due to close-after-delete, and operations that change the size | ||
| 78 | * of a file (such operations can often be the source of data corruption | ||
| 79 | * if applications aren't using file locking properly). | ||
| 80 | */ | ||
| 81 | enum nfs_stat_eventcounters { | ||
| 82 | NFSIOS_INODEREVALIDATE = 0, | ||
| 83 | NFSIOS_DENTRYREVALIDATE, | ||
| 84 | NFSIOS_DATAINVALIDATE, | ||
| 85 | NFSIOS_ATTRINVALIDATE, | ||
| 86 | NFSIOS_VFSOPEN, | ||
| 87 | NFSIOS_VFSLOOKUP, | ||
| 88 | NFSIOS_VFSACCESS, | ||
| 89 | NFSIOS_VFSUPDATEPAGE, | ||
| 90 | NFSIOS_VFSREADPAGE, | ||
| 91 | NFSIOS_VFSREADPAGES, | ||
| 92 | NFSIOS_VFSWRITEPAGE, | ||
| 93 | NFSIOS_VFSWRITEPAGES, | ||
| 94 | NFSIOS_VFSGETDENTS, | ||
| 95 | NFSIOS_VFSSETATTR, | ||
| 96 | NFSIOS_VFSFLUSH, | ||
| 97 | NFSIOS_VFSFSYNC, | ||
| 98 | NFSIOS_VFSLOCK, | ||
| 99 | NFSIOS_VFSRELEASE, | ||
| 100 | NFSIOS_CONGESTIONWAIT, | ||
| 101 | NFSIOS_SETATTRTRUNC, | ||
| 102 | NFSIOS_EXTENDWRITE, | ||
| 103 | NFSIOS_SILLYRENAME, | ||
| 104 | NFSIOS_SHORTREAD, | ||
| 105 | NFSIOS_SHORTWRITE, | ||
| 106 | NFSIOS_DELAY, | ||
| 107 | __NFSIOS_COUNTSMAX, | ||
| 108 | }; | ||
| 109 | |||
| 110 | #ifdef __KERNEL__ | ||
| 111 | |||
| 112 | #include <linux/percpu.h> | 13 | #include <linux/percpu.h> |
| 113 | #include <linux/cache.h> | 14 | #include <linux/cache.h> |
| 15 | #include <linux/nfs_iostat.h> | ||
| 114 | 16 | ||
| 115 | struct nfs_iostats { | 17 | struct nfs_iostats { |
| 116 | unsigned long long bytes[__NFSIOS_BYTESMAX]; | 18 | unsigned long long bytes[__NFSIOS_BYTESMAX]; |
| 117 | unsigned long events[__NFSIOS_COUNTSMAX]; | 19 | unsigned long events[__NFSIOS_COUNTSMAX]; |
| 118 | } ____cacheline_aligned; | 20 | } ____cacheline_aligned; |
| 119 | 21 | ||
| 120 | static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) | 22 | static inline void nfs_inc_server_stats(const struct nfs_server *server, |
| 23 | enum nfs_stat_eventcounters stat) | ||
| 121 | { | 24 | { |
| 122 | struct nfs_iostats *iostats; | 25 | struct nfs_iostats *iostats; |
| 123 | int cpu; | 26 | int cpu; |
| 124 | 27 | ||
| 125 | cpu = get_cpu(); | 28 | cpu = get_cpu(); |
| 126 | iostats = per_cpu_ptr(server->io_stats, cpu); | 29 | iostats = per_cpu_ptr(server->io_stats, cpu); |
| 127 | iostats->events[stat] ++; | 30 | iostats->events[stat]++; |
| 128 | put_cpu_no_resched(); | 31 | put_cpu_no_resched(); |
| 129 | } | 32 | } |
| 130 | 33 | ||
| 131 | static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) | 34 | static inline void nfs_inc_stats(const struct inode *inode, |
| 35 | enum nfs_stat_eventcounters stat) | ||
| 132 | { | 36 | { |
| 133 | nfs_inc_server_stats(NFS_SERVER(inode), stat); | 37 | nfs_inc_server_stats(NFS_SERVER(inode), stat); |
| 134 | } | 38 | } |
| 135 | 39 | ||
| 136 | static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) | 40 | static inline void nfs_add_server_stats(const struct nfs_server *server, |
| 41 | enum nfs_stat_bytecounters stat, | ||
| 42 | unsigned long addend) | ||
| 137 | { | 43 | { |
| 138 | struct nfs_iostats *iostats; | 44 | struct nfs_iostats *iostats; |
| 139 | int cpu; | 45 | int cpu; |
| @@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat | |||
| 144 | put_cpu_no_resched(); | 50 | put_cpu_no_resched(); |
| 145 | } | 51 | } |
| 146 | 52 | ||
| 147 | static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) | 53 | static inline void nfs_add_stats(const struct inode *inode, |
| 54 | enum nfs_stat_bytecounters stat, | ||
| 55 | unsigned long addend) | ||
| 148 | { | 56 | { |
| 149 | nfs_add_server_stats(NFS_SERVER(inode), stat, addend); | 57 | nfs_add_server_stats(NFS_SERVER(inode), stat, addend); |
| 150 | } | 58 | } |
| @@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats) | |||
| 160 | free_percpu(stats); | 68 | free_percpu(stats); |
| 161 | } | 69 | } |
| 162 | 70 | ||
| 163 | #endif | 71 | #endif /* _NFS_IOSTAT */ |
| 164 | #endif | ||
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502cc..779d2eb649c5 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c | |||
| @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, | |||
| 130 | struct mnt_fhstatus *res) | 130 | struct mnt_fhstatus *res) |
| 131 | { | 131 | { |
| 132 | struct nfs_fh *fh = res->fh; | 132 | struct nfs_fh *fh = res->fh; |
| 133 | unsigned size; | ||
| 133 | 134 | ||
| 134 | if ((res->status = ntohl(*p++)) == 0) { | 135 | if ((res->status = ntohl(*p++)) == 0) { |
| 135 | int size = ntohl(*p++); | 136 | size = ntohl(*p++); |
| 136 | if (size <= NFS3_FHSIZE) { | 137 | if (size <= NFS3_FHSIZE && size != 0) { |
| 137 | fh->size = size; | 138 | fh->size = size; |
| 138 | memcpy(fh->data, p, size); | 139 | memcpy(fh->data, p, size); |
| 139 | } else | 140 | } else |
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9b7362565c0c..423842f51ac9 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c | |||
| @@ -5,6 +5,8 @@ | |||
| 5 | #include <linux/posix_acl_xattr.h> | 5 | #include <linux/posix_acl_xattr.h> |
| 6 | #include <linux/nfsacl.h> | 6 | #include <linux/nfsacl.h> |
| 7 | 7 | ||
| 8 | #include "internal.h" | ||
| 9 | |||
| 8 | #define NFSDBG_FACILITY NFSDBG_PROC | 10 | #define NFSDBG_FACILITY NFSDBG_PROC |
| 9 | 11 | ||
| 10 | ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) | 12 | ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) |
| @@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) | |||
| 205 | status = nfs_revalidate_inode(server, inode); | 207 | status = nfs_revalidate_inode(server, inode); |
| 206 | if (status < 0) | 208 | if (status < 0) |
| 207 | return ERR_PTR(status); | 209 | return ERR_PTR(status); |
| 210 | if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) | ||
| 211 | nfs_zap_acl_cache(inode); | ||
| 208 | acl = nfs3_get_cached_acl(inode, type); | 212 | acl = nfs3_get_cached_acl(inode, type); |
| 209 | if (acl != ERR_PTR(-EAGAIN)) | 213 | if (acl != ERR_PTR(-EAGAIN)) |
| 210 | return acl; | 214 | return acl; |
| @@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, | |||
| 319 | dprintk("NFS call setacl\n"); | 323 | dprintk("NFS call setacl\n"); |
| 320 | msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; | 324 | msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; |
| 321 | status = rpc_call_sync(server->client_acl, &msg, 0); | 325 | status = rpc_call_sync(server->client_acl, &msg, 0); |
| 322 | spin_lock(&inode->i_lock); | 326 | nfs_access_zap_cache(inode); |
| 323 | NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; | 327 | nfs_zap_acl_cache(inode); |
| 324 | spin_unlock(&inode->i_lock); | ||
| 325 | dprintk("NFS reply setacl: %d\n", status); | 328 | dprintk("NFS reply setacl: %d\n", status); |
| 326 | 329 | ||
| 327 | /* pages may have been allocated at the xdr layer. */ | 330 | /* pages may have been allocated at the xdr layer. */ |
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c3523ad03ed1..1e750e4574a9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c | |||
| @@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, | |||
| 129 | int status; | 129 | int status; |
| 130 | 130 | ||
| 131 | dprintk("NFS call setattr\n"); | 131 | dprintk("NFS call setattr\n"); |
| 132 | if (sattr->ia_valid & ATTR_FILE) | ||
| 133 | msg.rpc_cred = nfs_file_cred(sattr->ia_file); | ||
| 132 | nfs_fattr_init(fattr); | 134 | nfs_fattr_init(fattr); |
| 133 | status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); | 135 | status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); |
| 134 | if (status == 0) | 136 | if (status == 0) |
| @@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, | |||
| 248 | return status; | 250 | return status; |
| 249 | } | 251 | } |
| 250 | 252 | ||
| 253 | struct nfs3_createdata { | ||
| 254 | struct rpc_message msg; | ||
| 255 | union { | ||
| 256 | struct nfs3_createargs create; | ||
| 257 | struct nfs3_mkdirargs mkdir; | ||
| 258 | struct nfs3_symlinkargs symlink; | ||
| 259 | struct nfs3_mknodargs mknod; | ||
| 260 | } arg; | ||
| 261 | struct nfs3_diropres res; | ||
| 262 | struct nfs_fh fh; | ||
| 263 | struct nfs_fattr fattr; | ||
| 264 | struct nfs_fattr dir_attr; | ||
| 265 | }; | ||
| 266 | |||
| 267 | static struct nfs3_createdata *nfs3_alloc_createdata(void) | ||
| 268 | { | ||
| 269 | struct nfs3_createdata *data; | ||
| 270 | |||
| 271 | data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
| 272 | if (data != NULL) { | ||
| 273 | data->msg.rpc_argp = &data->arg; | ||
| 274 | data->msg.rpc_resp = &data->res; | ||
| 275 | data->res.fh = &data->fh; | ||
| 276 | data->res.fattr = &data->fattr; | ||
| 277 | data->res.dir_attr = &data->dir_attr; | ||
| 278 | nfs_fattr_init(data->res.fattr); | ||
| 279 | nfs_fattr_init(data->res.dir_attr); | ||
| 280 | } | ||
| 281 | return data; | ||
| 282 | } | ||
| 283 | |||
| 284 | static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) | ||
| 285 | { | ||
| 286 | int status; | ||
| 287 | |||
| 288 | status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); | ||
| 289 | nfs_post_op_update_inode(dir, data->res.dir_attr); | ||
| 290 | if (status == 0) | ||
| 291 | status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); | ||
| 292 | return status; | ||
| 293 | } | ||
| 294 | |||
| 295 | static void nfs3_free_createdata(struct nfs3_createdata *data) | ||
| 296 | { | ||
| 297 | kfree(data); | ||
| 298 | } | ||
| 299 | |||
| 251 | /* | 300 | /* |
| 252 | * Create a regular file. | 301 | * Create a regular file. |
| 253 | * For now, we don't implement O_EXCL. | 302 | * For now, we don't implement O_EXCL. |
| @@ -256,70 +305,60 @@ static int | |||
| 256 | nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | 305 | nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, |
| 257 | int flags, struct nameidata *nd) | 306 | int flags, struct nameidata *nd) |
| 258 | { | 307 | { |
| 259 | struct nfs_fh fhandle; | 308 | struct nfs3_createdata *data; |
| 260 | struct nfs_fattr fattr; | ||
| 261 | struct nfs_fattr dir_attr; | ||
| 262 | struct nfs3_createargs arg = { | ||
| 263 | .fh = NFS_FH(dir), | ||
| 264 | .name = dentry->d_name.name, | ||
| 265 | .len = dentry->d_name.len, | ||
| 266 | .sattr = sattr, | ||
| 267 | }; | ||
| 268 | struct nfs3_diropres res = { | ||
| 269 | .dir_attr = &dir_attr, | ||
| 270 | .fh = &fhandle, | ||
| 271 | .fattr = &fattr | ||
| 272 | }; | ||
| 273 | struct rpc_message msg = { | ||
| 274 | .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE], | ||
| 275 | .rpc_argp = &arg, | ||
| 276 | .rpc_resp = &res, | ||
| 277 | }; | ||
| 278 | mode_t mode = sattr->ia_mode; | 309 | mode_t mode = sattr->ia_mode; |
| 279 | int status; | 310 | int status = -ENOMEM; |
| 280 | 311 | ||
| 281 | dprintk("NFS call create %s\n", dentry->d_name.name); | 312 | dprintk("NFS call create %s\n", dentry->d_name.name); |
| 282 | arg.createmode = NFS3_CREATE_UNCHECKED; | 313 | |
| 314 | data = nfs3_alloc_createdata(); | ||
| 315 | if (data == NULL) | ||
| 316 | goto out; | ||
| 317 | |||
| 318 | data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; | ||
| 319 | data->arg.create.fh = NFS_FH(dir); | ||
| 320 | data->arg.create.name = dentry->d_name.name; | ||
| 321 | data->arg.create.len = dentry->d_name.len; | ||
| 322 | data->arg.create.sattr = sattr; | ||
| 323 | |||
| 324 | data->arg.create.createmode = NFS3_CREATE_UNCHECKED; | ||
| 283 | if (flags & O_EXCL) { | 325 | if (flags & O_EXCL) { |
| 284 | arg.createmode = NFS3_CREATE_EXCLUSIVE; | 326 | data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; |
| 285 | arg.verifier[0] = jiffies; | 327 | data->arg.create.verifier[0] = jiffies; |
| 286 | arg.verifier[1] = current->pid; | 328 | data->arg.create.verifier[1] = current->pid; |
| 287 | } | 329 | } |
| 288 | 330 | ||
| 289 | sattr->ia_mode &= ~current->fs->umask; | 331 | sattr->ia_mode &= ~current->fs->umask; |
| 290 | 332 | ||
| 291 | again: | 333 | for (;;) { |
| 292 | nfs_fattr_init(&dir_attr); | 334 | status = nfs3_do_create(dir, dentry, data); |
| 293 | nfs_fattr_init(&fattr); | ||
| 294 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | ||
| 295 | nfs_refresh_inode(dir, &dir_attr); | ||
| 296 | 335 | ||
| 297 | /* If the server doesn't support the exclusive creation semantics, | 336 | if (status != -ENOTSUPP) |
| 298 | * try again with simple 'guarded' mode. */ | 337 | break; |
| 299 | if (status == -ENOTSUPP) { | 338 | /* If the server doesn't support the exclusive creation |
| 300 | switch (arg.createmode) { | 339 | * semantics, try again with simple 'guarded' mode. */ |
| 340 | switch (data->arg.create.createmode) { | ||
| 301 | case NFS3_CREATE_EXCLUSIVE: | 341 | case NFS3_CREATE_EXCLUSIVE: |
| 302 | arg.createmode = NFS3_CREATE_GUARDED; | 342 | data->arg.create.createmode = NFS3_CREATE_GUARDED; |
| 303 | break; | 343 | break; |
| 304 | 344 | ||
| 305 | case NFS3_CREATE_GUARDED: | 345 | case NFS3_CREATE_GUARDED: |
| 306 | arg.createmode = NFS3_CREATE_UNCHECKED; | 346 | data->arg.create.createmode = NFS3_CREATE_UNCHECKED; |
| 307 | break; | 347 | break; |
| 308 | 348 | ||
| 309 | case NFS3_CREATE_UNCHECKED: | 349 | case NFS3_CREATE_UNCHECKED: |
| 310 | goto out; | 350 | goto out; |
| 311 | } | 351 | } |
| 312 | goto again; | 352 | nfs_fattr_init(data->res.dir_attr); |
| 353 | nfs_fattr_init(data->res.fattr); | ||
| 313 | } | 354 | } |
| 314 | 355 | ||
| 315 | if (status == 0) | ||
| 316 | status = nfs_instantiate(dentry, &fhandle, &fattr); | ||
| 317 | if (status != 0) | 356 | if (status != 0) |
| 318 | goto out; | 357 | goto out; |
| 319 | 358 | ||
| 320 | /* When we created the file with exclusive semantics, make | 359 | /* When we created the file with exclusive semantics, make |
| 321 | * sure we set the attributes afterwards. */ | 360 | * sure we set the attributes afterwards. */ |
| 322 | if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { | 361 | if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { |
| 323 | dprintk("NFS call setattr (post-create)\n"); | 362 | dprintk("NFS call setattr (post-create)\n"); |
| 324 | 363 | ||
| 325 | if (!(sattr->ia_valid & ATTR_ATIME_SET)) | 364 | if (!(sattr->ia_valid & ATTR_ATIME_SET)) |
| @@ -330,14 +369,15 @@ again: | |||
| 330 | /* Note: we could use a guarded setattr here, but I'm | 369 | /* Note: we could use a guarded setattr here, but I'm |
| 331 | * not sure this buys us anything (and I'd have | 370 | * not sure this buys us anything (and I'd have |
| 332 | * to revamp the NFSv3 XDR code) */ | 371 | * to revamp the NFSv3 XDR code) */ |
| 333 | status = nfs3_proc_setattr(dentry, &fattr, sattr); | 372 | status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); |
| 334 | nfs_post_op_update_inode(dentry->d_inode, &fattr); | 373 | nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); |
| 335 | dprintk("NFS reply setattr (post-create): %d\n", status); | 374 | dprintk("NFS reply setattr (post-create): %d\n", status); |
| 375 | if (status != 0) | ||
| 376 | goto out; | ||
| 336 | } | 377 | } |
| 337 | if (status != 0) | ||
| 338 | goto out; | ||
| 339 | status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); | 378 | status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); |
| 340 | out: | 379 | out: |
| 380 | nfs3_free_createdata(data); | ||
| 341 | dprintk("NFS reply create: %d\n", status); | 381 | dprintk("NFS reply create: %d\n", status); |
| 342 | return status; | 382 | return status; |
| 343 | } | 383 | } |
| @@ -452,40 +492,28 @@ static int | |||
| 452 | nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, | 492 | nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, |
| 453 | unsigned int len, struct iattr *sattr) | 493 | unsigned int len, struct iattr *sattr) |
| 454 | { | 494 | { |
| 455 | struct nfs_fh fhandle; | 495 | struct nfs3_createdata *data; |
| 456 | struct nfs_fattr fattr, dir_attr; | 496 | int status = -ENOMEM; |
| 457 | struct nfs3_symlinkargs arg = { | ||
| 458 | .fromfh = NFS_FH(dir), | ||
| 459 | .fromname = dentry->d_name.name, | ||
| 460 | .fromlen = dentry->d_name.len, | ||
| 461 | .pages = &page, | ||
| 462 | .pathlen = len, | ||
| 463 | .sattr = sattr | ||
| 464 | }; | ||
| 465 | struct nfs3_diropres res = { | ||
| 466 | .dir_attr = &dir_attr, | ||
| 467 | .fh = &fhandle, | ||
| 468 | .fattr = &fattr | ||
| 469 | }; | ||
| 470 | struct rpc_message msg = { | ||
| 471 | .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], | ||
| 472 | .rpc_argp = &arg, | ||
| 473 | .rpc_resp = &res, | ||
| 474 | }; | ||
| 475 | int status; | ||
| 476 | 497 | ||
| 477 | if (len > NFS3_MAXPATHLEN) | 498 | if (len > NFS3_MAXPATHLEN) |
| 478 | return -ENAMETOOLONG; | 499 | return -ENAMETOOLONG; |
| 479 | 500 | ||
| 480 | dprintk("NFS call symlink %s\n", dentry->d_name.name); | 501 | dprintk("NFS call symlink %s\n", dentry->d_name.name); |
| 481 | 502 | ||
| 482 | nfs_fattr_init(&dir_attr); | 503 | data = nfs3_alloc_createdata(); |
| 483 | nfs_fattr_init(&fattr); | 504 | if (data == NULL) |
| 484 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | ||
| 485 | nfs_post_op_update_inode(dir, &dir_attr); | ||
| 486 | if (status != 0) | ||
| 487 | goto out; | 505 | goto out; |
| 488 | status = nfs_instantiate(dentry, &fhandle, &fattr); | 506 | data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; |
| 507 | data->arg.symlink.fromfh = NFS_FH(dir); | ||
| 508 | data->arg.symlink.fromname = dentry->d_name.name; | ||
| 509 | data->arg.symlink.fromlen = dentry->d_name.len; | ||
| 510 | data->arg.symlink.pages = &page; | ||
| 511 | data->arg.symlink.pathlen = len; | ||
| 512 | data->arg.symlink.sattr = sattr; | ||
| 513 | |||
| 514 | status = nfs3_do_create(dir, dentry, data); | ||
| 515 | |||
| 516 | nfs3_free_createdata(data); | ||
| 489 | out: | 517 | out: |
| 490 | dprintk("NFS reply symlink: %d\n", status); | 518 | dprintk("NFS reply symlink: %d\n", status); |
| 491 | return status; | 519 | return status; |
| @@ -494,42 +522,31 @@ out: | |||
| 494 | static int | 522 | static int |
| 495 | nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) | 523 | nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) |
| 496 | { | 524 | { |
| 497 | struct nfs_fh fhandle; | 525 | struct nfs3_createdata *data; |
| 498 | struct nfs_fattr fattr, dir_attr; | ||
| 499 | struct nfs3_mkdirargs arg = { | ||
| 500 | .fh = NFS_FH(dir), | ||
| 501 | .name = dentry->d_name.name, | ||
| 502 | .len = dentry->d_name.len, | ||
| 503 | .sattr = sattr | ||
| 504 | }; | ||
| 505 | struct nfs3_diropres res = { | ||
| 506 | .dir_attr = &dir_attr, | ||
| 507 | .fh = &fhandle, | ||
| 508 | .fattr = &fattr | ||
| 509 | }; | ||
| 510 | struct rpc_message msg = { | ||
| 511 | .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR], | ||
| 512 | .rpc_argp = &arg, | ||
| 513 | .rpc_resp = &res, | ||
| 514 | }; | ||
| 515 | int mode = sattr->ia_mode; | 526 | int mode = sattr->ia_mode; |
| 516 | int status; | 527 | int status = -ENOMEM; |
| 517 | 528 | ||
| 518 | dprintk("NFS call mkdir %s\n", dentry->d_name.name); | 529 | dprintk("NFS call mkdir %s\n", dentry->d_name.name); |
| 519 | 530 | ||
| 520 | sattr->ia_mode &= ~current->fs->umask; | 531 | sattr->ia_mode &= ~current->fs->umask; |
| 521 | 532 | ||
| 522 | nfs_fattr_init(&dir_attr); | 533 | data = nfs3_alloc_createdata(); |
| 523 | nfs_fattr_init(&fattr); | 534 | if (data == NULL) |
| 524 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | ||
| 525 | nfs_post_op_update_inode(dir, &dir_attr); | ||
| 526 | if (status != 0) | ||
| 527 | goto out; | 535 | goto out; |
| 528 | status = nfs_instantiate(dentry, &fhandle, &fattr); | 536 | |
| 537 | data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; | ||
| 538 | data->arg.mkdir.fh = NFS_FH(dir); | ||
| 539 | data->arg.mkdir.name = dentry->d_name.name; | ||
| 540 | data->arg.mkdir.len = dentry->d_name.len; | ||
| 541 | data->arg.mkdir.sattr = sattr; | ||
| 542 | |||
| 543 | status = nfs3_do_create(dir, dentry, data); | ||
| 529 | if (status != 0) | 544 | if (status != 0) |
| 530 | goto out; | 545 | goto out; |
| 546 | |||
| 531 | status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); | 547 | status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); |
| 532 | out: | 548 | out: |
| 549 | nfs3_free_createdata(data); | ||
| 533 | dprintk("NFS reply mkdir: %d\n", status); | 550 | dprintk("NFS reply mkdir: %d\n", status); |
| 534 | return status; | 551 | return status; |
| 535 | } | 552 | } |
| @@ -615,52 +632,50 @@ static int | |||
| 615 | nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | 632 | nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, |
| 616 | dev_t rdev) | 633 | dev_t rdev) |
| 617 | { | 634 | { |
| 618 | struct nfs_fh fh; | 635 | struct nfs3_createdata *data; |
| 619 | struct nfs_fattr fattr, dir_attr; | ||
| 620 | struct nfs3_mknodargs arg = { | ||
| 621 | .fh = NFS_FH(dir), | ||
| 622 | .name = dentry->d_name.name, | ||
| 623 | .len = dentry->d_name.len, | ||
| 624 | .sattr = sattr, | ||
| 625 | .rdev = rdev | ||
| 626 | }; | ||
| 627 | struct nfs3_diropres res = { | ||
| 628 | .dir_attr = &dir_attr, | ||
| 629 | .fh = &fh, | ||
| 630 | .fattr = &fattr | ||
| 631 | }; | ||
| 632 | struct rpc_message msg = { | ||
| 633 | .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD], | ||
| 634 | .rpc_argp = &arg, | ||
| 635 | .rpc_resp = &res, | ||
| 636 | }; | ||
| 637 | mode_t mode = sattr->ia_mode; | 636 | mode_t mode = sattr->ia_mode; |
| 638 | int status; | 637 | int status = -ENOMEM; |
| 639 | |||
| 640 | switch (sattr->ia_mode & S_IFMT) { | ||
| 641 | case S_IFBLK: arg.type = NF3BLK; break; | ||
| 642 | case S_IFCHR: arg.type = NF3CHR; break; | ||
| 643 | case S_IFIFO: arg.type = NF3FIFO; break; | ||
| 644 | case S_IFSOCK: arg.type = NF3SOCK; break; | ||
| 645 | default: return -EINVAL; | ||
| 646 | } | ||
| 647 | 638 | ||
| 648 | dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, | 639 | dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, |
| 649 | MAJOR(rdev), MINOR(rdev)); | 640 | MAJOR(rdev), MINOR(rdev)); |
| 650 | 641 | ||
| 651 | sattr->ia_mode &= ~current->fs->umask; | 642 | sattr->ia_mode &= ~current->fs->umask; |
| 652 | 643 | ||
| 653 | nfs_fattr_init(&dir_attr); | 644 | data = nfs3_alloc_createdata(); |
| 654 | nfs_fattr_init(&fattr); | 645 | if (data == NULL) |
| 655 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | ||
| 656 | nfs_post_op_update_inode(dir, &dir_attr); | ||
| 657 | if (status != 0) | ||
| 658 | goto out; | 646 | goto out; |
| 659 | status = nfs_instantiate(dentry, &fh, &fattr); | 647 | |
| 648 | data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; | ||
| 649 | data->arg.mknod.fh = NFS_FH(dir); | ||
| 650 | data->arg.mknod.name = dentry->d_name.name; | ||
| 651 | data->arg.mknod.len = dentry->d_name.len; | ||
| 652 | data->arg.mknod.sattr = sattr; | ||
| 653 | data->arg.mknod.rdev = rdev; | ||
| 654 | |||
| 655 | switch (sattr->ia_mode & S_IFMT) { | ||
| 656 | case S_IFBLK: | ||
| 657 | data->arg.mknod.type = NF3BLK; | ||
| 658 | break; | ||
| 659 | case S_IFCHR: | ||
| 660 | data->arg.mknod.type = NF3CHR; | ||
| 661 | break; | ||
| 662 | case S_IFIFO: | ||
| 663 | data->arg.mknod.type = NF3FIFO; | ||
| 664 | break; | ||
| 665 | case S_IFSOCK: | ||
| 666 | data->arg.mknod.type = NF3SOCK; | ||
| 667 | break; | ||
| 668 | default: | ||
| 669 | status = -EINVAL; | ||
| 670 | goto out; | ||
| 671 | } | ||
| 672 | |||
| 673 | status = nfs3_do_create(dir, dentry, data); | ||
| 660 | if (status != 0) | 674 | if (status != 0) |
| 661 | goto out; | 675 | goto out; |
| 662 | status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); | 676 | status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); |
| 663 | out: | 677 | out: |
| 678 | nfs3_free_createdata(data); | ||
| 664 | dprintk("NFS reply mknod: %d\n", status); | 679 | dprintk("NFS reply mknod: %d\n", status); |
| 665 | return status; | 680 | return status; |
| 666 | } | 681 | } |
| @@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { | |||
| 801 | .write_done = nfs3_write_done, | 816 | .write_done = nfs3_write_done, |
| 802 | .commit_setup = nfs3_proc_commit_setup, | 817 | .commit_setup = nfs3_proc_commit_setup, |
| 803 | .commit_done = nfs3_commit_done, | 818 | .commit_done = nfs3_commit_done, |
| 804 | .file_open = nfs_open, | ||
| 805 | .file_release = nfs_release, | ||
| 806 | .lock = nfs3_proc_lock, | 819 | .lock = nfs3_proc_lock, |
| 807 | .clear_acl_cache = nfs3_forget_cached_acls, | 820 | .clear_acl_cache = nfs3_forget_cached_acls, |
| 808 | }; | 821 | }; |
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1293e0acd82b..c910413eaeca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
| @@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) | |||
| 451 | /* Save the delegation */ | 451 | /* Save the delegation */ |
| 452 | memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); | 452 | memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); |
| 453 | rcu_read_unlock(); | 453 | rcu_read_unlock(); |
| 454 | lock_kernel(); | ||
| 455 | ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); | 454 | ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); |
| 456 | unlock_kernel(); | ||
| 457 | if (ret != 0) | 455 | if (ret != 0) |
| 458 | goto out; | 456 | goto out; |
| 459 | ret = -EAGAIN; | 457 | ret = -EAGAIN; |
| @@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int | |||
| 1139 | return res; | 1137 | return res; |
| 1140 | } | 1138 | } |
| 1141 | 1139 | ||
| 1142 | static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, | 1140 | static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, |
| 1143 | struct iattr *sattr, struct nfs4_state *state) | 1141 | struct nfs_fattr *fattr, struct iattr *sattr, |
| 1142 | struct nfs4_state *state) | ||
| 1144 | { | 1143 | { |
| 1145 | struct nfs_server *server = NFS_SERVER(inode); | 1144 | struct nfs_server *server = NFS_SERVER(inode); |
| 1146 | struct nfs_setattrargs arg = { | 1145 | struct nfs_setattrargs arg = { |
| @@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, | |||
| 1154 | .server = server, | 1153 | .server = server, |
| 1155 | }; | 1154 | }; |
| 1156 | struct rpc_message msg = { | 1155 | struct rpc_message msg = { |
| 1157 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], | 1156 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], |
| 1158 | .rpc_argp = &arg, | 1157 | .rpc_argp = &arg, |
| 1159 | .rpc_resp = &res, | 1158 | .rpc_resp = &res, |
| 1159 | .rpc_cred = cred, | ||
| 1160 | }; | 1160 | }; |
| 1161 | unsigned long timestamp = jiffies; | 1161 | unsigned long timestamp = jiffies; |
| 1162 | int status; | 1162 | int status; |
| @@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, | |||
| 1166 | if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { | 1166 | if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { |
| 1167 | /* Use that stateid */ | 1167 | /* Use that stateid */ |
| 1168 | } else if (state != NULL) { | 1168 | } else if (state != NULL) { |
| 1169 | msg.rpc_cred = state->owner->so_cred; | ||
| 1170 | nfs4_copy_stateid(&arg.stateid, state, current->files); | 1169 | nfs4_copy_stateid(&arg.stateid, state, current->files); |
| 1171 | } else | 1170 | } else |
| 1172 | memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); | 1171 | memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); |
| @@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, | |||
| 1177 | return status; | 1176 | return status; |
| 1178 | } | 1177 | } |
| 1179 | 1178 | ||
| 1180 | static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, | 1179 | static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, |
| 1181 | struct iattr *sattr, struct nfs4_state *state) | 1180 | struct nfs_fattr *fattr, struct iattr *sattr, |
| 1181 | struct nfs4_state *state) | ||
| 1182 | { | 1182 | { |
| 1183 | struct nfs_server *server = NFS_SERVER(inode); | 1183 | struct nfs_server *server = NFS_SERVER(inode); |
| 1184 | struct nfs4_exception exception = { }; | 1184 | struct nfs4_exception exception = { }; |
| 1185 | int err; | 1185 | int err; |
| 1186 | do { | 1186 | do { |
| 1187 | err = nfs4_handle_exception(server, | 1187 | err = nfs4_handle_exception(server, |
| 1188 | _nfs4_do_setattr(inode, fattr, sattr, state), | 1188 | _nfs4_do_setattr(inode, cred, fattr, sattr, state), |
| 1189 | &exception); | 1189 | &exception); |
| 1190 | } while (exception.retry); | 1190 | } while (exception.retry); |
| 1191 | return err; | 1191 | return err; |
| @@ -1647,29 +1647,25 @@ static int | |||
| 1647 | nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, | 1647 | nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, |
| 1648 | struct iattr *sattr) | 1648 | struct iattr *sattr) |
| 1649 | { | 1649 | { |
| 1650 | struct rpc_cred *cred; | ||
| 1651 | struct inode *inode = dentry->d_inode; | 1650 | struct inode *inode = dentry->d_inode; |
| 1652 | struct nfs_open_context *ctx; | 1651 | struct rpc_cred *cred = NULL; |
| 1653 | struct nfs4_state *state = NULL; | 1652 | struct nfs4_state *state = NULL; |
| 1654 | int status; | 1653 | int status; |
| 1655 | 1654 | ||
| 1656 | nfs_fattr_init(fattr); | 1655 | nfs_fattr_init(fattr); |
| 1657 | 1656 | ||
| 1658 | cred = rpc_lookup_cred(); | ||
| 1659 | if (IS_ERR(cred)) | ||
| 1660 | return PTR_ERR(cred); | ||
| 1661 | |||
| 1662 | /* Search for an existing open(O_WRITE) file */ | 1657 | /* Search for an existing open(O_WRITE) file */ |
| 1663 | ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); | 1658 | if (sattr->ia_valid & ATTR_FILE) { |
| 1664 | if (ctx != NULL) | 1659 | struct nfs_open_context *ctx; |
| 1660 | |||
| 1661 | ctx = nfs_file_open_context(sattr->ia_file); | ||
| 1662 | cred = ctx->cred; | ||
| 1665 | state = ctx->state; | 1663 | state = ctx->state; |
| 1664 | } | ||
| 1666 | 1665 | ||
| 1667 | status = nfs4_do_setattr(inode, fattr, sattr, state); | 1666 | status = nfs4_do_setattr(inode, cred, fattr, sattr, state); |
| 1668 | if (status == 0) | 1667 | if (status == 0) |
| 1669 | nfs_setattr_update_inode(inode, sattr); | 1668 | nfs_setattr_update_inode(inode, sattr); |
| 1670 | if (ctx != NULL) | ||
| 1671 | put_nfs_open_context(ctx); | ||
| 1672 | put_rpccred(cred); | ||
| 1673 | return status; | 1669 | return status; |
| 1674 | } | 1670 | } |
| 1675 | 1671 | ||
| @@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | |||
| 1897 | goto out; | 1893 | goto out; |
| 1898 | } | 1894 | } |
| 1899 | state = nfs4_do_open(dir, &path, flags, sattr, cred); | 1895 | state = nfs4_do_open(dir, &path, flags, sattr, cred); |
| 1900 | put_rpccred(cred); | ||
| 1901 | d_drop(dentry); | 1896 | d_drop(dentry); |
| 1902 | if (IS_ERR(state)) { | 1897 | if (IS_ERR(state)) { |
| 1903 | status = PTR_ERR(state); | 1898 | status = PTR_ERR(state); |
| 1904 | goto out; | 1899 | goto out_putcred; |
| 1905 | } | 1900 | } |
| 1906 | d_add(dentry, igrab(state->inode)); | 1901 | d_add(dentry, igrab(state->inode)); |
| 1907 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); | 1902 | nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); |
| 1908 | if (flags & O_EXCL) { | 1903 | if (flags & O_EXCL) { |
| 1909 | struct nfs_fattr fattr; | 1904 | struct nfs_fattr fattr; |
| 1910 | status = nfs4_do_setattr(state->inode, &fattr, sattr, state); | 1905 | status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); |
| 1911 | if (status == 0) | 1906 | if (status == 0) |
| 1912 | nfs_setattr_update_inode(state->inode, sattr); | 1907 | nfs_setattr_update_inode(state->inode, sattr); |
| 1913 | nfs_post_op_update_inode(state->inode, &fattr); | 1908 | nfs_post_op_update_inode(state->inode, &fattr); |
| @@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | |||
| 1916 | status = nfs4_intent_set_file(nd, &path, state); | 1911 | status = nfs4_intent_set_file(nd, &path, state); |
| 1917 | else | 1912 | else |
| 1918 | nfs4_close_sync(&path, state, flags); | 1913 | nfs4_close_sync(&path, state, flags); |
| 1914 | out_putcred: | ||
| 1915 | put_rpccred(cred); | ||
| 1919 | out: | 1916 | out: |
| 1920 | return status; | 1917 | return status; |
| 1921 | } | 1918 | } |
| @@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n | |||
| 2079 | return err; | 2076 | return err; |
| 2080 | } | 2077 | } |
| 2081 | 2078 | ||
| 2079 | struct nfs4_createdata { | ||
| 2080 | struct rpc_message msg; | ||
| 2081 | struct nfs4_create_arg arg; | ||
| 2082 | struct nfs4_create_res res; | ||
| 2083 | struct nfs_fh fh; | ||
| 2084 | struct nfs_fattr fattr; | ||
| 2085 | struct nfs_fattr dir_fattr; | ||
| 2086 | }; | ||
| 2087 | |||
| 2088 | static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, | ||
| 2089 | struct qstr *name, struct iattr *sattr, u32 ftype) | ||
| 2090 | { | ||
| 2091 | struct nfs4_createdata *data; | ||
| 2092 | |||
| 2093 | data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
| 2094 | if (data != NULL) { | ||
| 2095 | struct nfs_server *server = NFS_SERVER(dir); | ||
| 2096 | |||
| 2097 | data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; | ||
| 2098 | data->msg.rpc_argp = &data->arg; | ||
| 2099 | data->msg.rpc_resp = &data->res; | ||
| 2100 | data->arg.dir_fh = NFS_FH(dir); | ||
| 2101 | data->arg.server = server; | ||
| 2102 | data->arg.name = name; | ||
| 2103 | data->arg.attrs = sattr; | ||
| 2104 | data->arg.ftype = ftype; | ||
| 2105 | data->arg.bitmask = server->attr_bitmask; | ||
| 2106 | data->res.server = server; | ||
| 2107 | data->res.fh = &data->fh; | ||
| 2108 | data->res.fattr = &data->fattr; | ||
| 2109 | data->res.dir_fattr = &data->dir_fattr; | ||
| 2110 | nfs_fattr_init(data->res.fattr); | ||
| 2111 | nfs_fattr_init(data->res.dir_fattr); | ||
| 2112 | } | ||
| 2113 | return data; | ||
| 2114 | } | ||
| 2115 | |||
| 2116 | static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) | ||
| 2117 | { | ||
| 2118 | int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); | ||
| 2119 | if (status == 0) { | ||
| 2120 | update_changeattr(dir, &data->res.dir_cinfo); | ||
| 2121 | nfs_post_op_update_inode(dir, data->res.dir_fattr); | ||
| 2122 | status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); | ||
| 2123 | } | ||
| 2124 | return status; | ||
| 2125 | } | ||
| 2126 | |||
| 2127 | static void nfs4_free_createdata(struct nfs4_createdata *data) | ||
| 2128 | { | ||
| 2129 | kfree(data); | ||
| 2130 | } | ||
| 2131 | |||
| 2082 | static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, | 2132 | static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, |
| 2083 | struct page *page, unsigned int len, struct iattr *sattr) | 2133 | struct page *page, unsigned int len, struct iattr *sattr) |
| 2084 | { | 2134 | { |
| 2085 | struct nfs_server *server = NFS_SERVER(dir); | 2135 | struct nfs4_createdata *data; |
| 2086 | struct nfs_fh fhandle; | 2136 | int status = -ENAMETOOLONG; |
| 2087 | struct nfs_fattr fattr, dir_fattr; | ||
| 2088 | struct nfs4_create_arg arg = { | ||
| 2089 | .dir_fh = NFS_FH(dir), | ||
| 2090 | .server = server, | ||
| 2091 | .name = &dentry->d_name, | ||
| 2092 | .attrs = sattr, | ||
| 2093 | .ftype = NF4LNK, | ||
| 2094 | .bitmask = server->attr_bitmask, | ||
| 2095 | }; | ||
| 2096 | struct nfs4_create_res res = { | ||
| 2097 | .server = server, | ||
| 2098 | .fh = &fhandle, | ||
| 2099 | .fattr = &fattr, | ||
| 2100 | .dir_fattr = &dir_fattr, | ||
| 2101 | }; | ||
| 2102 | struct rpc_message msg = { | ||
| 2103 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], | ||
| 2104 | .rpc_argp = &arg, | ||
| 2105 | .rpc_resp = &res, | ||
| 2106 | }; | ||
| 2107 | int status; | ||
| 2108 | 2137 | ||
| 2109 | if (len > NFS4_MAXPATHLEN) | 2138 | if (len > NFS4_MAXPATHLEN) |
| 2110 | return -ENAMETOOLONG; | 2139 | goto out; |
| 2111 | 2140 | ||
| 2112 | arg.u.symlink.pages = &page; | 2141 | status = -ENOMEM; |
| 2113 | arg.u.symlink.len = len; | 2142 | data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); |
| 2114 | nfs_fattr_init(&fattr); | 2143 | if (data == NULL) |
| 2115 | nfs_fattr_init(&dir_fattr); | 2144 | goto out; |
| 2145 | |||
| 2146 | data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; | ||
| 2147 | data->arg.u.symlink.pages = &page; | ||
| 2148 | data->arg.u.symlink.len = len; | ||
| 2116 | 2149 | ||
| 2117 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | 2150 | status = nfs4_do_create(dir, dentry, data); |
| 2118 | if (!status) { | 2151 | |
| 2119 | update_changeattr(dir, &res.dir_cinfo); | 2152 | nfs4_free_createdata(data); |
| 2120 | nfs_post_op_update_inode(dir, res.dir_fattr); | 2153 | out: |
| 2121 | status = nfs_instantiate(dentry, &fhandle, &fattr); | ||
| 2122 | } | ||
| 2123 | return status; | 2154 | return status; |
| 2124 | } | 2155 | } |
| 2125 | 2156 | ||
| @@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, | |||
| 2140 | static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, | 2171 | static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, |
| 2141 | struct iattr *sattr) | 2172 | struct iattr *sattr) |
| 2142 | { | 2173 | { |
| 2143 | struct nfs_server *server = NFS_SERVER(dir); | 2174 | struct nfs4_createdata *data; |
| 2144 | struct nfs_fh fhandle; | 2175 | int status = -ENOMEM; |
| 2145 | struct nfs_fattr fattr, dir_fattr; | ||
| 2146 | struct nfs4_create_arg arg = { | ||
| 2147 | .dir_fh = NFS_FH(dir), | ||
| 2148 | .server = server, | ||
| 2149 | .name = &dentry->d_name, | ||
| 2150 | .attrs = sattr, | ||
| 2151 | .ftype = NF4DIR, | ||
| 2152 | .bitmask = server->attr_bitmask, | ||
| 2153 | }; | ||
| 2154 | struct nfs4_create_res res = { | ||
| 2155 | .server = server, | ||
| 2156 | .fh = &fhandle, | ||
| 2157 | .fattr = &fattr, | ||
| 2158 | .dir_fattr = &dir_fattr, | ||
| 2159 | }; | ||
| 2160 | struct rpc_message msg = { | ||
| 2161 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], | ||
| 2162 | .rpc_argp = &arg, | ||
| 2163 | .rpc_resp = &res, | ||
| 2164 | }; | ||
| 2165 | int status; | ||
| 2166 | 2176 | ||
| 2167 | nfs_fattr_init(&fattr); | 2177 | data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); |
| 2168 | nfs_fattr_init(&dir_fattr); | 2178 | if (data == NULL) |
| 2169 | 2179 | goto out; | |
| 2170 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | 2180 | |
| 2171 | if (!status) { | 2181 | status = nfs4_do_create(dir, dentry, data); |
| 2172 | update_changeattr(dir, &res.dir_cinfo); | 2182 | |
| 2173 | nfs_post_op_update_inode(dir, res.dir_fattr); | 2183 | nfs4_free_createdata(data); |
| 2174 | status = nfs_instantiate(dentry, &fhandle, &fattr); | 2184 | out: |
| 2175 | } | ||
| 2176 | return status; | 2185 | return status; |
| 2177 | } | 2186 | } |
| 2178 | 2187 | ||
| @@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, | |||
| 2242 | static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, | 2251 | static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, |
| 2243 | struct iattr *sattr, dev_t rdev) | 2252 | struct iattr *sattr, dev_t rdev) |
| 2244 | { | 2253 | { |
| 2245 | struct nfs_server *server = NFS_SERVER(dir); | 2254 | struct nfs4_createdata *data; |
| 2246 | struct nfs_fh fh; | 2255 | int mode = sattr->ia_mode; |
| 2247 | struct nfs_fattr fattr, dir_fattr; | 2256 | int status = -ENOMEM; |
| 2248 | struct nfs4_create_arg arg = { | ||
| 2249 | .dir_fh = NFS_FH(dir), | ||
| 2250 | .server = server, | ||
| 2251 | .name = &dentry->d_name, | ||
| 2252 | .attrs = sattr, | ||
| 2253 | .bitmask = server->attr_bitmask, | ||
| 2254 | }; | ||
| 2255 | struct nfs4_create_res res = { | ||
| 2256 | .server = server, | ||
| 2257 | .fh = &fh, | ||
| 2258 | .fattr = &fattr, | ||
| 2259 | .dir_fattr = &dir_fattr, | ||
| 2260 | }; | ||
| 2261 | struct rpc_message msg = { | ||
| 2262 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], | ||
| 2263 | .rpc_argp = &arg, | ||
| 2264 | .rpc_resp = &res, | ||
| 2265 | }; | ||
| 2266 | int status; | ||
| 2267 | int mode = sattr->ia_mode; | ||
| 2268 | |||
| 2269 | nfs_fattr_init(&fattr); | ||
| 2270 | nfs_fattr_init(&dir_fattr); | ||
| 2271 | 2257 | ||
| 2272 | BUG_ON(!(sattr->ia_valid & ATTR_MODE)); | 2258 | BUG_ON(!(sattr->ia_valid & ATTR_MODE)); |
| 2273 | BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); | 2259 | BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); |
| 2260 | |||
| 2261 | data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); | ||
| 2262 | if (data == NULL) | ||
| 2263 | goto out; | ||
| 2264 | |||
| 2274 | if (S_ISFIFO(mode)) | 2265 | if (S_ISFIFO(mode)) |
| 2275 | arg.ftype = NF4FIFO; | 2266 | data->arg.ftype = NF4FIFO; |
| 2276 | else if (S_ISBLK(mode)) { | 2267 | else if (S_ISBLK(mode)) { |
| 2277 | arg.ftype = NF4BLK; | 2268 | data->arg.ftype = NF4BLK; |
| 2278 | arg.u.device.specdata1 = MAJOR(rdev); | 2269 | data->arg.u.device.specdata1 = MAJOR(rdev); |
| 2279 | arg.u.device.specdata2 = MINOR(rdev); | 2270 | data->arg.u.device.specdata2 = MINOR(rdev); |
| 2280 | } | 2271 | } |
| 2281 | else if (S_ISCHR(mode)) { | 2272 | else if (S_ISCHR(mode)) { |
| 2282 | arg.ftype = NF4CHR; | 2273 | data->arg.ftype = NF4CHR; |
| 2283 | arg.u.device.specdata1 = MAJOR(rdev); | 2274 | data->arg.u.device.specdata1 = MAJOR(rdev); |
| 2284 | arg.u.device.specdata2 = MINOR(rdev); | 2275 | data->arg.u.device.specdata2 = MINOR(rdev); |
| 2285 | } | 2276 | } |
| 2286 | else | ||
| 2287 | arg.ftype = NF4SOCK; | ||
| 2288 | 2277 | ||
| 2289 | status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); | 2278 | status = nfs4_do_create(dir, dentry, data); |
| 2290 | if (status == 0) { | 2279 | |
| 2291 | update_changeattr(dir, &res.dir_cinfo); | 2280 | nfs4_free_createdata(data); |
| 2292 | nfs_post_op_update_inode(dir, res.dir_fattr); | 2281 | out: |
| 2293 | status = nfs_instantiate(dentry, &fh, &fattr); | ||
| 2294 | } | ||
| 2295 | return status; | 2282 | return status; |
| 2296 | } | 2283 | } |
| 2297 | 2284 | ||
| @@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) | |||
| 2706 | ret = nfs_revalidate_inode(server, inode); | 2693 | ret = nfs_revalidate_inode(server, inode); |
| 2707 | if (ret < 0) | 2694 | if (ret < 0) |
| 2708 | return ret; | 2695 | return ret; |
| 2696 | if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) | ||
| 2697 | nfs_zap_acl_cache(inode); | ||
| 2709 | ret = nfs4_read_cached_acl(inode, buf, buflen); | 2698 | ret = nfs4_read_cached_acl(inode, buf, buflen); |
| 2710 | if (ret != -ENOENT) | 2699 | if (ret != -ENOENT) |
| 2711 | return ret; | 2700 | return ret; |
| @@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl | |||
| 2733 | nfs_inode_return_delegation(inode); | 2722 | nfs_inode_return_delegation(inode); |
| 2734 | buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); | 2723 | buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); |
| 2735 | ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); | 2724 | ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); |
| 2736 | nfs_zap_caches(inode); | 2725 | nfs_access_zap_cache(inode); |
| 2726 | nfs_zap_acl_cache(inode); | ||
| 2737 | return ret; | 2727 | return ret; |
| 2738 | } | 2728 | } |
| 2739 | 2729 | ||
| @@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) | |||
| 2767 | task->tk_status = 0; | 2757 | task->tk_status = 0; |
| 2768 | return -EAGAIN; | 2758 | return -EAGAIN; |
| 2769 | case -NFS4ERR_DELAY: | 2759 | case -NFS4ERR_DELAY: |
| 2770 | nfs_inc_server_stats((struct nfs_server *) server, | 2760 | nfs_inc_server_stats(server, NFSIOS_DELAY); |
| 2771 | NFSIOS_DELAY); | ||
| 2772 | case -NFS4ERR_GRACE: | 2761 | case -NFS4ERR_GRACE: |
| 2773 | rpc_delay(task, NFS4_POLL_RETRY_MAX); | 2762 | rpc_delay(task, NFS4_POLL_RETRY_MAX); |
| 2774 | task->tk_status = 0; | 2763 | task->tk_status = 0; |
| @@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre | |||
| 2933 | 2922 | ||
| 2934 | int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) | 2923 | int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) |
| 2935 | { | 2924 | { |
| 2936 | long timeout; | 2925 | long timeout = 0; |
| 2937 | int err; | 2926 | int err; |
| 2938 | do { | 2927 | do { |
| 2939 | err = _nfs4_proc_setclientid_confirm(clp, cred); | 2928 | err = _nfs4_proc_setclientid_confirm(clp, cred); |
| @@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { | |||
| 3725 | .write_done = nfs4_write_done, | 3714 | .write_done = nfs4_write_done, |
| 3726 | .commit_setup = nfs4_proc_commit_setup, | 3715 | .commit_setup = nfs4_proc_commit_setup, |
| 3727 | .commit_done = nfs4_commit_done, | 3716 | .commit_done = nfs4_commit_done, |
| 3728 | .file_open = nfs_open, | ||
| 3729 | .file_release = nfs_release, | ||
| 3730 | .lock = nfs4_proc_lock, | 3717 | .lock = nfs4_proc_lock, |
| 3731 | .clear_acl_cache = nfs4_zap_acl_attr, | 3718 | .clear_acl_cache = nfs4_zap_acl_attr, |
| 3732 | }; | 3719 | }; |
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 856a8934f610..401ef8b28f97 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c | |||
| @@ -940,7 +940,6 @@ static int reclaimer(void *ptr) | |||
| 940 | allow_signal(SIGKILL); | 940 | allow_signal(SIGKILL); |
| 941 | 941 | ||
| 942 | /* Ensure exclusive access to NFSv4 state */ | 942 | /* Ensure exclusive access to NFSv4 state */ |
| 943 | lock_kernel(); | ||
| 944 | down_write(&clp->cl_sem); | 943 | down_write(&clp->cl_sem); |
| 945 | /* Are there any NFS mounts out there? */ | 944 | /* Are there any NFS mounts out there? */ |
| 946 | if (list_empty(&clp->cl_superblocks)) | 945 | if (list_empty(&clp->cl_superblocks)) |
| @@ -1000,7 +999,6 @@ restart_loop: | |||
| 1000 | nfs_delegation_reap_unclaimed(clp); | 999 | nfs_delegation_reap_unclaimed(clp); |
| 1001 | out: | 1000 | out: |
| 1002 | up_write(&clp->cl_sem); | 1001 | up_write(&clp->cl_sem); |
| 1003 | unlock_kernel(); | ||
| 1004 | if (status == -NFS4ERR_CB_PATH_DOWN) | 1002 | if (status == -NFS4ERR_CB_PATH_DOWN) |
| 1005 | nfs_handle_cb_pathdown(clp); | 1003 | nfs_handle_cb_pathdown(clp); |
| 1006 | nfs4_clear_recover_bit(clp); | 1004 | nfs4_clear_recover_bit(clp); |
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 531379d36823..46763d1cd397 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c | |||
| @@ -1,6 +1,4 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $ | ||
| 3 | * | ||
| 4 | * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> | 2 | * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> |
| 5 | * | 3 | * |
| 6 | * Allow an NFS filesystem to be mounted as root. The way this works is: | 4 | * Allow an NFS filesystem to be mounted as root. The way this works is: |
| @@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name) | |||
| 297 | nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ | 295 | nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ |
| 298 | nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; | 296 | nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; |
| 299 | nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; | 297 | nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; |
| 300 | nfs_data.acregmin = 3; | 298 | nfs_data.acregmin = NFS_DEF_ACREGMIN; |
| 301 | nfs_data.acregmax = 60; | 299 | nfs_data.acregmax = NFS_DEF_ACREGMAX; |
| 302 | nfs_data.acdirmin = 30; | 300 | nfs_data.acdirmin = NFS_DEF_ACDIRMIN; |
| 303 | nfs_data.acdirmax = 60; | 301 | nfs_data.acdirmax = NFS_DEF_ACDIRMAX; |
| 304 | strcpy(buf, NFS_ROOT); | 302 | strcpy(buf, NFS_ROOT); |
| 305 | 303 | ||
| 306 | /* Process options received from the remote server */ | 304 | /* Process options received from the remote server */ |
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 03599bfe81cf..4dbb84df1b68 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c | |||
| @@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, | |||
| 129 | sattr->ia_mode &= S_IALLUGO; | 129 | sattr->ia_mode &= S_IALLUGO; |
| 130 | 130 | ||
| 131 | dprintk("NFS call setattr\n"); | 131 | dprintk("NFS call setattr\n"); |
| 132 | if (sattr->ia_valid & ATTR_FILE) | ||
| 133 | msg.rpc_cred = nfs_file_cred(sattr->ia_file); | ||
| 132 | nfs_fattr_init(fattr); | 134 | nfs_fattr_init(fattr); |
| 133 | status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); | 135 | status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); |
| 134 | if (status == 0) | 136 | if (status == 0) |
| @@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) | |||
| 598 | return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); | 600 | return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); |
| 599 | } | 601 | } |
| 600 | 602 | ||
| 603 | /* Helper functions for NFS lock bounds checking */ | ||
| 604 | #define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) | ||
| 605 | static int nfs_lock_check_bounds(const struct file_lock *fl) | ||
| 606 | { | ||
| 607 | __s32 start, end; | ||
| 608 | |||
| 609 | start = (__s32)fl->fl_start; | ||
| 610 | if ((loff_t)start != fl->fl_start) | ||
| 611 | goto out_einval; | ||
| 612 | |||
| 613 | if (fl->fl_end != OFFSET_MAX) { | ||
| 614 | end = (__s32)fl->fl_end; | ||
| 615 | if ((loff_t)end != fl->fl_end) | ||
| 616 | goto out_einval; | ||
| 617 | } else | ||
| 618 | end = NFS_LOCK32_OFFSET_MAX; | ||
| 619 | |||
| 620 | if (start < 0 || start > end) | ||
| 621 | goto out_einval; | ||
| 622 | return 0; | ||
| 623 | out_einval: | ||
| 624 | return -EINVAL; | ||
| 625 | } | ||
| 601 | 626 | ||
| 602 | const struct nfs_rpc_ops nfs_v2_clientops = { | 627 | const struct nfs_rpc_ops nfs_v2_clientops = { |
| 603 | .version = 2, /* protocol version */ | 628 | .version = 2, /* protocol version */ |
| @@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { | |||
| 630 | .write_setup = nfs_proc_write_setup, | 655 | .write_setup = nfs_proc_write_setup, |
| 631 | .write_done = nfs_write_done, | 656 | .write_done = nfs_write_done, |
| 632 | .commit_setup = nfs_proc_commit_setup, | 657 | .commit_setup = nfs_proc_commit_setup, |
| 633 | .file_open = nfs_open, | ||
| 634 | .file_release = nfs_release, | ||
| 635 | .lock = nfs_proc_lock, | 658 | .lock = nfs_proc_lock, |
| 659 | .lock_check_bounds = nfs_lock_check_bounds, | ||
| 636 | }; | 660 | }; |
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7b..1b94e3650f5c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
| @@ -47,6 +47,7 @@ | |||
| 47 | #include <linux/inet.h> | 47 | #include <linux/inet.h> |
| 48 | #include <linux/in6.h> | 48 | #include <linux/in6.h> |
| 49 | #include <net/ipv6.h> | 49 | #include <net/ipv6.h> |
| 50 | #include <linux/netdevice.h> | ||
| 50 | #include <linux/nfs_xdr.h> | 51 | #include <linux/nfs_xdr.h> |
| 51 | #include <linux/magic.h> | 52 | #include <linux/magic.h> |
| 52 | #include <linux/parser.h> | 53 | #include <linux/parser.h> |
| @@ -65,7 +66,6 @@ | |||
| 65 | enum { | 66 | enum { |
| 66 | /* Mount options that take no arguments */ | 67 | /* Mount options that take no arguments */ |
| 67 | Opt_soft, Opt_hard, | 68 | Opt_soft, Opt_hard, |
| 68 | Opt_intr, Opt_nointr, | ||
| 69 | Opt_posix, Opt_noposix, | 69 | Opt_posix, Opt_noposix, |
| 70 | Opt_cto, Opt_nocto, | 70 | Opt_cto, Opt_nocto, |
| 71 | Opt_ac, Opt_noac, | 71 | Opt_ac, Opt_noac, |
| @@ -92,8 +92,8 @@ enum { | |||
| 92 | Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, | 92 | Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, |
| 93 | Opt_addr, Opt_mountaddr, Opt_clientaddr, | 93 | Opt_addr, Opt_mountaddr, Opt_clientaddr, |
| 94 | 94 | ||
| 95 | /* Mount options that are ignored */ | 95 | /* Special mount options */ |
| 96 | Opt_userspace, Opt_deprecated, | 96 | Opt_userspace, Opt_deprecated, Opt_sloppy, |
| 97 | 97 | ||
| 98 | Opt_err | 98 | Opt_err |
| 99 | }; | 99 | }; |
| @@ -101,10 +101,14 @@ enum { | |||
| 101 | static match_table_t nfs_mount_option_tokens = { | 101 | static match_table_t nfs_mount_option_tokens = { |
| 102 | { Opt_userspace, "bg" }, | 102 | { Opt_userspace, "bg" }, |
| 103 | { Opt_userspace, "fg" }, | 103 | { Opt_userspace, "fg" }, |
| 104 | { Opt_userspace, "retry=%s" }, | ||
| 105 | |||
| 106 | { Opt_sloppy, "sloppy" }, | ||
| 107 | |||
| 104 | { Opt_soft, "soft" }, | 108 | { Opt_soft, "soft" }, |
| 105 | { Opt_hard, "hard" }, | 109 | { Opt_hard, "hard" }, |
| 106 | { Opt_intr, "intr" }, | 110 | { Opt_deprecated, "intr" }, |
| 107 | { Opt_nointr, "nointr" }, | 111 | { Opt_deprecated, "nointr" }, |
| 108 | { Opt_posix, "posix" }, | 112 | { Opt_posix, "posix" }, |
| 109 | { Opt_noposix, "noposix" }, | 113 | { Opt_noposix, "noposix" }, |
| 110 | { Opt_cto, "cto" }, | 114 | { Opt_cto, "cto" }, |
| @@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = { | |||
| 136 | { Opt_acdirmin, "acdirmin=%u" }, | 140 | { Opt_acdirmin, "acdirmin=%u" }, |
| 137 | { Opt_acdirmax, "acdirmax=%u" }, | 141 | { Opt_acdirmax, "acdirmax=%u" }, |
| 138 | { Opt_actimeo, "actimeo=%u" }, | 142 | { Opt_actimeo, "actimeo=%u" }, |
| 139 | { Opt_userspace, "retry=%u" }, | ||
| 140 | { Opt_namelen, "namlen=%u" }, | 143 | { Opt_namelen, "namlen=%u" }, |
| 141 | { Opt_mountport, "mountport=%u" }, | 144 | { Opt_mountport, "mountport=%u" }, |
| 142 | { Opt_mountvers, "mountvers=%u" }, | 145 | { Opt_mountvers, "mountvers=%u" }, |
| @@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, | |||
| 207 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); | 210 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); |
| 208 | static void nfs_kill_super(struct super_block *); | 211 | static void nfs_kill_super(struct super_block *); |
| 209 | static void nfs_put_super(struct super_block *); | 212 | static void nfs_put_super(struct super_block *); |
| 213 | static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); | ||
| 210 | 214 | ||
| 211 | static struct file_system_type nfs_fs_type = { | 215 | static struct file_system_type nfs_fs_type = { |
| 212 | .owner = THIS_MODULE, | 216 | .owner = THIS_MODULE, |
| @@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = { | |||
| 234 | .umount_begin = nfs_umount_begin, | 238 | .umount_begin = nfs_umount_begin, |
| 235 | .show_options = nfs_show_options, | 239 | .show_options = nfs_show_options, |
| 236 | .show_stats = nfs_show_stats, | 240 | .show_stats = nfs_show_stats, |
| 241 | .remount_fs = nfs_remount, | ||
| 237 | }; | 242 | }; |
| 238 | 243 | ||
| 239 | #ifdef CONFIG_NFS_V4 | 244 | #ifdef CONFIG_NFS_V4 |
| @@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = { | |||
| 278 | .umount_begin = nfs_umount_begin, | 283 | .umount_begin = nfs_umount_begin, |
| 279 | .show_options = nfs_show_options, | 284 | .show_options = nfs_show_options, |
| 280 | .show_stats = nfs_show_stats, | 285 | .show_stats = nfs_show_stats, |
| 286 | .remount_fs = nfs_remount, | ||
| 281 | }; | 287 | }; |
| 282 | #endif | 288 | #endif |
| 283 | 289 | ||
| @@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 368 | }; | 374 | }; |
| 369 | int error; | 375 | int error; |
| 370 | 376 | ||
| 371 | lock_kernel(); | ||
| 372 | |||
| 373 | error = server->nfs_client->rpc_ops->statfs(server, fh, &res); | 377 | error = server->nfs_client->rpc_ops->statfs(server, fh, &res); |
| 374 | if (error < 0) | 378 | if (error < 0) |
| 375 | goto out_err; | 379 | goto out_err; |
| @@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 401 | 405 | ||
| 402 | buf->f_namelen = server->namelen; | 406 | buf->f_namelen = server->namelen; |
| 403 | 407 | ||
| 404 | unlock_kernel(); | ||
| 405 | return 0; | 408 | return 0; |
| 406 | 409 | ||
| 407 | out_err: | 410 | out_err: |
| 408 | dprintk("%s: statfs error = %d\n", __func__, -error); | 411 | dprintk("%s: statfs error = %d\n", __func__, -error); |
| 409 | unlock_kernel(); | ||
| 410 | return error; | 412 | return error; |
| 411 | } | 413 | } |
| 412 | 414 | ||
| @@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, | |||
| 514 | if (nfss->bsize != 0) | 516 | if (nfss->bsize != 0) |
| 515 | seq_printf(m, ",bsize=%u", nfss->bsize); | 517 | seq_printf(m, ",bsize=%u", nfss->bsize); |
| 516 | seq_printf(m, ",namlen=%u", nfss->namelen); | 518 | seq_printf(m, ",namlen=%u", nfss->namelen); |
| 517 | if (nfss->acregmin != 3*HZ || showdefaults) | 519 | if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) |
| 518 | seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); | 520 | seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); |
| 519 | if (nfss->acregmax != 60*HZ || showdefaults) | 521 | if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) |
| 520 | seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); | 522 | seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); |
| 521 | if (nfss->acdirmin != 30*HZ || showdefaults) | 523 | if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) |
| 522 | seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); | 524 | seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); |
| 523 | if (nfss->acdirmax != 60*HZ || showdefaults) | 525 | if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) |
| 524 | seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); | 526 | seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); |
| 525 | for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { | 527 | for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { |
| 526 | if (nfss->flags & nfs_infop->flag) | 528 | if (nfss->flags & nfs_infop->flag) |
| @@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr) | |||
| 702 | return 0; | 704 | return 0; |
| 703 | } | 705 | } |
| 704 | 706 | ||
| 707 | static void nfs_parse_ipv4_address(char *string, size_t str_len, | ||
| 708 | struct sockaddr *sap, size_t *addr_len) | ||
| 709 | { | ||
| 710 | struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
| 711 | u8 *addr = (u8 *)&sin->sin_addr.s_addr; | ||
| 712 | |||
| 713 | if (str_len <= INET_ADDRSTRLEN) { | ||
| 714 | dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", | ||
| 715 | (int)str_len, string); | ||
| 716 | |||
| 717 | sin->sin_family = AF_INET; | ||
| 718 | *addr_len = sizeof(*sin); | ||
| 719 | if (in4_pton(string, str_len, addr, '\0', NULL)) | ||
| 720 | return; | ||
| 721 | } | ||
| 722 | |||
| 723 | sap->sa_family = AF_UNSPEC; | ||
| 724 | *addr_len = 0; | ||
| 725 | } | ||
| 726 | |||
| 727 | #define IPV6_SCOPE_DELIMITER '%' | ||
| 728 | |||
| 729 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 730 | static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, | ||
| 731 | const char *delim, | ||
| 732 | struct sockaddr_in6 *sin6) | ||
| 733 | { | ||
| 734 | char *p; | ||
| 735 | size_t len; | ||
| 736 | |||
| 737 | if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) | ||
| 738 | return ; | ||
| 739 | if (*delim != IPV6_SCOPE_DELIMITER) | ||
| 740 | return; | ||
| 741 | |||
| 742 | len = (string + str_len) - delim - 1; | ||
| 743 | p = kstrndup(delim + 1, len, GFP_KERNEL); | ||
| 744 | if (p) { | ||
| 745 | unsigned long scope_id = 0; | ||
| 746 | struct net_device *dev; | ||
| 747 | |||
| 748 | dev = dev_get_by_name(&init_net, p); | ||
| 749 | if (dev != NULL) { | ||
| 750 | scope_id = dev->ifindex; | ||
| 751 | dev_put(dev); | ||
| 752 | } else { | ||
| 753 | /* scope_id is set to zero on error */ | ||
| 754 | strict_strtoul(p, 10, &scope_id); | ||
| 755 | } | ||
| 756 | |||
| 757 | kfree(p); | ||
| 758 | sin6->sin6_scope_id = scope_id; | ||
| 759 | dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); | ||
| 760 | } | ||
| 761 | } | ||
| 762 | |||
| 763 | static void nfs_parse_ipv6_address(char *string, size_t str_len, | ||
| 764 | struct sockaddr *sap, size_t *addr_len) | ||
| 765 | { | ||
| 766 | struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
| 767 | u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; | ||
| 768 | const char *delim; | ||
| 769 | |||
| 770 | if (str_len <= INET6_ADDRSTRLEN) { | ||
| 771 | dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", | ||
| 772 | (int)str_len, string); | ||
| 773 | |||
| 774 | sin6->sin6_family = AF_INET6; | ||
| 775 | *addr_len = sizeof(*sin6); | ||
| 776 | if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { | ||
| 777 | nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); | ||
| 778 | return; | ||
| 779 | } | ||
| 780 | } | ||
| 781 | |||
| 782 | sap->sa_family = AF_UNSPEC; | ||
| 783 | *addr_len = 0; | ||
| 784 | } | ||
| 785 | #else | ||
| 786 | static void nfs_parse_ipv6_address(char *string, size_t str_len, | ||
| 787 | struct sockaddr *sap, size_t *addr_len) | ||
| 788 | { | ||
| 789 | sap->sa_family = AF_UNSPEC; | ||
| 790 | *addr_len = 0; | ||
| 791 | } | ||
| 792 | #endif | ||
| 793 | |||
| 705 | /* | 794 | /* |
| 706 | * Parse string addresses passed in via a mount option, | 795 | * Construct a sockaddr based on the contents of a string that contains |
| 707 | * and construct a sockaddr based on the result. | 796 | * an IP address in presentation format. |
| 708 | * | 797 | * |
| 709 | * If address parsing fails, set the sockaddr's address | 798 | * If there is a problem constructing the new sockaddr, set the address |
| 710 | * family to AF_UNSPEC to force nfs_verify_server_address() | 799 | * family to AF_UNSPEC. |
| 711 | * to punt the mount. | ||
| 712 | */ | 800 | */ |
| 713 | static void nfs_parse_server_address(char *value, | 801 | static void nfs_parse_ip_address(char *string, size_t str_len, |
| 714 | struct sockaddr *sap, | 802 | struct sockaddr *sap, size_t *addr_len) |
| 715 | size_t *len) | ||
| 716 | { | 803 | { |
| 717 | if (strchr(value, ':')) { | 804 | unsigned int i, colons; |
| 718 | struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; | ||
| 719 | u8 *addr = (u8 *)&ap->sin6_addr.in6_u; | ||
| 720 | 805 | ||
| 721 | ap->sin6_family = AF_INET6; | 806 | colons = 0; |
| 722 | *len = sizeof(*ap); | 807 | for (i = 0; i < str_len; i++) |
| 723 | if (in6_pton(value, -1, addr, '\0', NULL)) | 808 | if (string[i] == ':') |
| 724 | return; | 809 | colons++; |
| 725 | } else { | ||
| 726 | struct sockaddr_in *ap = (struct sockaddr_in *)sap; | ||
| 727 | u8 *addr = (u8 *)&ap->sin_addr.s_addr; | ||
| 728 | 810 | ||
| 729 | ap->sin_family = AF_INET; | 811 | if (colons >= 2) |
| 730 | *len = sizeof(*ap); | 812 | nfs_parse_ipv6_address(string, str_len, sap, addr_len); |
| 731 | if (in4_pton(value, -1, addr, '\0', NULL)) | 813 | else |
| 814 | nfs_parse_ipv4_address(string, str_len, sap, addr_len); | ||
| 815 | } | ||
| 816 | |||
| 817 | /* | ||
| 818 | * Sanity check the NFS transport protocol. | ||
| 819 | * | ||
| 820 | */ | ||
| 821 | static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) | ||
| 822 | { | ||
| 823 | switch (mnt->nfs_server.protocol) { | ||
| 824 | case XPRT_TRANSPORT_UDP: | ||
| 825 | case XPRT_TRANSPORT_TCP: | ||
| 826 | case XPRT_TRANSPORT_RDMA: | ||
| 827 | break; | ||
| 828 | default: | ||
| 829 | mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; | ||
| 830 | } | ||
| 831 | } | ||
| 832 | |||
| 833 | /* | ||
| 834 | * For text based NFSv2/v3 mounts, the mount protocol transport default | ||
| 835 | * settings should depend upon the specified NFS transport. | ||
| 836 | */ | ||
| 837 | static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) | ||
| 838 | { | ||
| 839 | nfs_validate_transport_protocol(mnt); | ||
| 840 | |||
| 841 | if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || | ||
| 842 | mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) | ||
| 732 | return; | 843 | return; |
| 844 | switch (mnt->nfs_server.protocol) { | ||
| 845 | case XPRT_TRANSPORT_UDP: | ||
| 846 | mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; | ||
| 847 | break; | ||
| 848 | case XPRT_TRANSPORT_TCP: | ||
| 849 | case XPRT_TRANSPORT_RDMA: | ||
| 850 | mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; | ||
| 733 | } | 851 | } |
| 852 | } | ||
| 734 | 853 | ||
| 735 | sap->sa_family = AF_UNSPEC; | 854 | /* |
| 736 | *len = 0; | 855 | * Parse the value of the 'sec=' option. |
| 856 | * | ||
| 857 | * The flavor_len setting is for v4 mounts. | ||
| 858 | */ | ||
| 859 | static int nfs_parse_security_flavors(char *value, | ||
| 860 | struct nfs_parsed_mount_data *mnt) | ||
| 861 | { | ||
| 862 | substring_t args[MAX_OPT_ARGS]; | ||
| 863 | |||
| 864 | dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); | ||
| 865 | |||
| 866 | switch (match_token(value, nfs_secflavor_tokens, args)) { | ||
| 867 | case Opt_sec_none: | ||
| 868 | mnt->auth_flavor_len = 0; | ||
| 869 | mnt->auth_flavors[0] = RPC_AUTH_NULL; | ||
| 870 | break; | ||
| 871 | case Opt_sec_sys: | ||
| 872 | mnt->auth_flavor_len = 0; | ||
| 873 | mnt->auth_flavors[0] = RPC_AUTH_UNIX; | ||
| 874 | break; | ||
| 875 | case Opt_sec_krb5: | ||
| 876 | mnt->auth_flavor_len = 1; | ||
| 877 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; | ||
| 878 | break; | ||
| 879 | case Opt_sec_krb5i: | ||
| 880 | mnt->auth_flavor_len = 1; | ||
| 881 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; | ||
| 882 | break; | ||
| 883 | case Opt_sec_krb5p: | ||
| 884 | mnt->auth_flavor_len = 1; | ||
| 885 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; | ||
| 886 | break; | ||
| 887 | case Opt_sec_lkey: | ||
| 888 | mnt->auth_flavor_len = 1; | ||
| 889 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; | ||
| 890 | break; | ||
| 891 | case Opt_sec_lkeyi: | ||
| 892 | mnt->auth_flavor_len = 1; | ||
| 893 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; | ||
| 894 | break; | ||
| 895 | case Opt_sec_lkeyp: | ||
| 896 | mnt->auth_flavor_len = 1; | ||
| 897 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; | ||
| 898 | break; | ||
| 899 | case Opt_sec_spkm: | ||
| 900 | mnt->auth_flavor_len = 1; | ||
| 901 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; | ||
| 902 | break; | ||
| 903 | case Opt_sec_spkmi: | ||
| 904 | mnt->auth_flavor_len = 1; | ||
| 905 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; | ||
| 906 | break; | ||
| 907 | case Opt_sec_spkmp: | ||
| 908 | mnt->auth_flavor_len = 1; | ||
| 909 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; | ||
| 910 | break; | ||
| 911 | default: | ||
| 912 | return 0; | ||
| 913 | } | ||
| 914 | |||
| 915 | return 1; | ||
| 916 | } | ||
| 917 | |||
| 918 | static void nfs_parse_invalid_value(const char *option) | ||
| 919 | { | ||
| 920 | dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option); | ||
| 737 | } | 921 | } |
| 738 | 922 | ||
| 739 | /* | 923 | /* |
| 740 | * Error-check and convert a string of mount options from user space into | 924 | * Error-check and convert a string of mount options from user space into |
| 741 | * a data structure | 925 | * a data structure. The whole mount string is processed; bad options are |
| 926 | * skipped as they are encountered. If there were no errors, return 1; | ||
| 927 | * otherwise return 0 (zero). | ||
| 742 | */ | 928 | */ |
| 743 | static int nfs_parse_mount_options(char *raw, | 929 | static int nfs_parse_mount_options(char *raw, |
| 744 | struct nfs_parsed_mount_data *mnt) | 930 | struct nfs_parsed_mount_data *mnt) |
| 745 | { | 931 | { |
| 746 | char *p, *string, *secdata; | 932 | char *p, *string, *secdata; |
| 747 | int rc; | 933 | int rc, sloppy = 0, errors = 0; |
| 748 | 934 | ||
| 749 | if (!raw) { | 935 | if (!raw) { |
| 750 | dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); | 936 | dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); |
| @@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw, | |||
| 777 | 963 | ||
| 778 | token = match_token(p, nfs_mount_option_tokens, args); | 964 | token = match_token(p, nfs_mount_option_tokens, args); |
| 779 | switch (token) { | 965 | switch (token) { |
| 966 | |||
| 967 | /* | ||
| 968 | * boolean options: foo/nofoo | ||
| 969 | */ | ||
| 780 | case Opt_soft: | 970 | case Opt_soft: |
| 781 | mnt->flags |= NFS_MOUNT_SOFT; | 971 | mnt->flags |= NFS_MOUNT_SOFT; |
| 782 | break; | 972 | break; |
| 783 | case Opt_hard: | 973 | case Opt_hard: |
| 784 | mnt->flags &= ~NFS_MOUNT_SOFT; | 974 | mnt->flags &= ~NFS_MOUNT_SOFT; |
| 785 | break; | 975 | break; |
| 786 | case Opt_intr: | ||
| 787 | case Opt_nointr: | ||
| 788 | break; | ||
| 789 | case Opt_posix: | 976 | case Opt_posix: |
| 790 | mnt->flags |= NFS_MOUNT_POSIX; | 977 | mnt->flags |= NFS_MOUNT_POSIX; |
| 791 | break; | 978 | break; |
| @@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw, | |||
| 819 | case Opt_udp: | 1006 | case Opt_udp: |
| 820 | mnt->flags &= ~NFS_MOUNT_TCP; | 1007 | mnt->flags &= ~NFS_MOUNT_TCP; |
| 821 | mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; | 1008 | mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; |
| 822 | mnt->timeo = 7; | ||
| 823 | mnt->retrans = 5; | ||
| 824 | break; | 1009 | break; |
| 825 | case Opt_tcp: | 1010 | case Opt_tcp: |
| 826 | mnt->flags |= NFS_MOUNT_TCP; | 1011 | mnt->flags |= NFS_MOUNT_TCP; |
| 827 | mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; | 1012 | mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; |
| 828 | mnt->timeo = 600; | ||
| 829 | mnt->retrans = 2; | ||
| 830 | break; | 1013 | break; |
| 831 | case Opt_rdma: | 1014 | case Opt_rdma: |
| 832 | mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ | 1015 | mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ |
| 833 | mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; | 1016 | mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; |
| 834 | mnt->timeo = 600; | ||
| 835 | mnt->retrans = 2; | ||
| 836 | break; | 1017 | break; |
| 837 | case Opt_acl: | 1018 | case Opt_acl: |
| 838 | mnt->flags &= ~NFS_MOUNT_NOACL; | 1019 | mnt->flags &= ~NFS_MOUNT_NOACL; |
| @@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw, | |||
| 853 | mnt->flags |= NFS_MOUNT_UNSHARED; | 1034 | mnt->flags |= NFS_MOUNT_UNSHARED; |
| 854 | break; | 1035 | break; |
| 855 | 1036 | ||
| 1037 | /* | ||
| 1038 | * options that take numeric values | ||
| 1039 | */ | ||
| 856 | case Opt_port: | 1040 | case Opt_port: |
| 857 | if (match_int(args, &option)) | 1041 | if (match_int(args, &option) || |
| 858 | return 0; | 1042 | option < 0 || option > USHORT_MAX) { |
| 859 | if (option < 0 || option > 65535) | 1043 | errors++; |
| 860 | return 0; | 1044 | nfs_parse_invalid_value("port"); |
| 861 | mnt->nfs_server.port = option; | 1045 | } else |
| 1046 | mnt->nfs_server.port = option; | ||
| 862 | break; | 1047 | break; |
| 863 | case Opt_rsize: | 1048 | case Opt_rsize: |
| 864 | if (match_int(args, &mnt->rsize)) | 1049 | if (match_int(args, &option) || option < 0) { |
| 865 | return 0; | 1050 | errors++; |
| 1051 | nfs_parse_invalid_value("rsize"); | ||
| 1052 | } else | ||
| 1053 | mnt->rsize = option; | ||
| 866 | break; | 1054 | break; |
| 867 | case Opt_wsize: | 1055 | case Opt_wsize: |
| 868 | if (match_int(args, &mnt->wsize)) | 1056 | if (match_int(args, &option) || option < 0) { |
| 869 | return 0; | 1057 | errors++; |
| 1058 | nfs_parse_invalid_value("wsize"); | ||
| 1059 | } else | ||
| 1060 | mnt->wsize = option; | ||
| 870 | break; | 1061 | break; |
| 871 | case Opt_bsize: | 1062 | case Opt_bsize: |
| 872 | if (match_int(args, &option)) | 1063 | if (match_int(args, &option) || option < 0) { |
| 873 | return 0; | 1064 | errors++; |
| 874 | if (option < 0) | 1065 | nfs_parse_invalid_value("bsize"); |
| 875 | return 0; | 1066 | } else |
| 876 | mnt->bsize = option; | 1067 | mnt->bsize = option; |
| 877 | break; | 1068 | break; |
| 878 | case Opt_timeo: | 1069 | case Opt_timeo: |
| 879 | if (match_int(args, &mnt->timeo)) | 1070 | if (match_int(args, &option) || option <= 0) { |
| 880 | return 0; | 1071 | errors++; |
| 1072 | nfs_parse_invalid_value("timeo"); | ||
| 1073 | } else | ||
| 1074 | mnt->timeo = option; | ||
| 881 | break; | 1075 | break; |
| 882 | case Opt_retrans: | 1076 | case Opt_retrans: |
| 883 | if (match_int(args, &mnt->retrans)) | 1077 | if (match_int(args, &option) || option <= 0) { |
| 884 | return 0; | 1078 | errors++; |
| 1079 | nfs_parse_invalid_value("retrans"); | ||
| 1080 | } else | ||
| 1081 | mnt->retrans = option; | ||
| 885 | break; | 1082 | break; |
| 886 | case Opt_acregmin: | 1083 | case Opt_acregmin: |
| 887 | if (match_int(args, &mnt->acregmin)) | 1084 | if (match_int(args, &option) || option < 0) { |
| 888 | return 0; | 1085 | errors++; |
| 1086 | nfs_parse_invalid_value("acregmin"); | ||
| 1087 | } else | ||
| 1088 | mnt->acregmin = option; | ||
| 889 | break; | 1089 | break; |
| 890 | case Opt_acregmax: | 1090 | case Opt_acregmax: |
| 891 | if (match_int(args, &mnt->acregmax)) | 1091 | if (match_int(args, &option) || option < 0) { |
| 892 | return 0; | 1092 | errors++; |
| 1093 | nfs_parse_invalid_value("acregmax"); | ||
| 1094 | } else | ||
| 1095 | mnt->acregmax = option; | ||
| 893 | break; | 1096 | break; |
| 894 | case Opt_acdirmin: | 1097 | case Opt_acdirmin: |
| 895 | if (match_int(args, &mnt->acdirmin)) | 1098 | if (match_int(args, &option) || option < 0) { |
| 896 | return 0; | 1099 | errors++; |
| 1100 | nfs_parse_invalid_value("acdirmin"); | ||
| 1101 | } else | ||
| 1102 | mnt->acdirmin = option; | ||
| 897 | break; | 1103 | break; |
| 898 | case Opt_acdirmax: | 1104 | case Opt_acdirmax: |
| 899 | if (match_int(args, &mnt->acdirmax)) | 1105 | if (match_int(args, &option) || option < 0) { |
| 900 | return 0; | 1106 | errors++; |
| 1107 | nfs_parse_invalid_value("acdirmax"); | ||
| 1108 | } else | ||
| 1109 | mnt->acdirmax = option; | ||
| 901 | break; | 1110 | break; |
| 902 | case Opt_actimeo: | 1111 | case Opt_actimeo: |
| 903 | if (match_int(args, &option)) | 1112 | if (match_int(args, &option) || option < 0) { |
| 904 | return 0; | 1113 | errors++; |
| 905 | if (option < 0) | 1114 | nfs_parse_invalid_value("actimeo"); |
| 906 | return 0; | 1115 | } else |
| 907 | mnt->acregmin = | 1116 | mnt->acregmin = mnt->acregmax = |
| 908 | mnt->acregmax = | 1117 | mnt->acdirmin = mnt->acdirmax = option; |
| 909 | mnt->acdirmin = | ||
| 910 | mnt->acdirmax = option; | ||
| 911 | break; | 1118 | break; |
| 912 | case Opt_namelen: | 1119 | case Opt_namelen: |
| 913 | if (match_int(args, &mnt->namlen)) | 1120 | if (match_int(args, &option) || option < 0) { |
| 914 | return 0; | 1121 | errors++; |
| 1122 | nfs_parse_invalid_value("namlen"); | ||
| 1123 | } else | ||
| 1124 | mnt->namlen = option; | ||
| 915 | break; | 1125 | break; |
| 916 | case Opt_mountport: | 1126 | case Opt_mountport: |
| 917 | if (match_int(args, &option)) | 1127 | if (match_int(args, &option) || |
| 918 | return 0; | 1128 | option < 0 || option > USHORT_MAX) { |
| 919 | if (option < 0 || option > 65535) | 1129 | errors++; |
| 920 | return 0; | 1130 | nfs_parse_invalid_value("mountport"); |
| 921 | mnt->mount_server.port = option; | 1131 | } else |
| 1132 | mnt->mount_server.port = option; | ||
| 922 | break; | 1133 | break; |
| 923 | case Opt_mountvers: | 1134 | case Opt_mountvers: |
| 924 | if (match_int(args, &option)) | 1135 | if (match_int(args, &option) || |
| 925 | return 0; | 1136 | option < NFS_MNT_VERSION || |
| 926 | if (option < 0) | 1137 | option > NFS_MNT3_VERSION) { |
| 927 | return 0; | 1138 | errors++; |
| 928 | mnt->mount_server.version = option; | 1139 | nfs_parse_invalid_value("mountvers"); |
| 1140 | } else | ||
| 1141 | mnt->mount_server.version = option; | ||
| 929 | break; | 1142 | break; |
| 930 | case Opt_nfsvers: | 1143 | case Opt_nfsvers: |
| 931 | if (match_int(args, &option)) | 1144 | if (match_int(args, &option)) { |
| 932 | return 0; | 1145 | errors++; |
| 1146 | nfs_parse_invalid_value("nfsvers"); | ||
| 1147 | break; | ||
| 1148 | } | ||
| 933 | switch (option) { | 1149 | switch (option) { |
| 934 | case 2: | 1150 | case NFS2_VERSION: |
| 935 | mnt->flags &= ~NFS_MOUNT_VER3; | 1151 | mnt->flags &= ~NFS_MOUNT_VER3; |
| 936 | break; | 1152 | break; |
| 937 | case 3: | 1153 | case NFS3_VERSION: |
| 938 | mnt->flags |= NFS_MOUNT_VER3; | 1154 | mnt->flags |= NFS_MOUNT_VER3; |
| 939 | break; | 1155 | break; |
| 940 | default: | 1156 | default: |
| 941 | goto out_unrec_vers; | 1157 | errors++; |
| 1158 | nfs_parse_invalid_value("nfsvers"); | ||
| 942 | } | 1159 | } |
| 943 | break; | 1160 | break; |
| 944 | 1161 | ||
| 1162 | /* | ||
| 1163 | * options that take text values | ||
| 1164 | */ | ||
| 945 | case Opt_sec: | 1165 | case Opt_sec: |
| 946 | string = match_strdup(args); | 1166 | string = match_strdup(args); |
| 947 | if (string == NULL) | 1167 | if (string == NULL) |
| 948 | goto out_nomem; | 1168 | goto out_nomem; |
| 949 | token = match_token(string, nfs_secflavor_tokens, args); | 1169 | rc = nfs_parse_security_flavors(string, mnt); |
| 950 | kfree(string); | 1170 | kfree(string); |
| 951 | 1171 | if (!rc) { | |
| 952 | /* | 1172 | errors++; |
| 953 | * The flags setting is for v2/v3. The flavor_len | 1173 | dfprintk(MOUNT, "NFS: unrecognized " |
| 954 | * setting is for v4. v2/v3 also need to know the | 1174 | "security flavor\n"); |
| 955 | * difference between NULL and UNIX. | ||
| 956 | */ | ||
| 957 | switch (token) { | ||
| 958 | case Opt_sec_none: | ||
| 959 | mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; | ||
| 960 | mnt->auth_flavor_len = 0; | ||
| 961 | mnt->auth_flavors[0] = RPC_AUTH_NULL; | ||
| 962 | break; | ||
| 963 | case Opt_sec_sys: | ||
| 964 | mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; | ||
| 965 | mnt->auth_flavor_len = 0; | ||
| 966 | mnt->auth_flavors[0] = RPC_AUTH_UNIX; | ||
| 967 | break; | ||
| 968 | case Opt_sec_krb5: | ||
| 969 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 970 | mnt->auth_flavor_len = 1; | ||
| 971 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; | ||
| 972 | break; | ||
| 973 | case Opt_sec_krb5i: | ||
| 974 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 975 | mnt->auth_flavor_len = 1; | ||
| 976 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; | ||
| 977 | break; | ||
| 978 | case Opt_sec_krb5p: | ||
| 979 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 980 | mnt->auth_flavor_len = 1; | ||
| 981 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; | ||
| 982 | break; | ||
| 983 | case Opt_sec_lkey: | ||
| 984 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 985 | mnt->auth_flavor_len = 1; | ||
| 986 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; | ||
| 987 | break; | ||
| 988 | case Opt_sec_lkeyi: | ||
| 989 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 990 | mnt->auth_flavor_len = 1; | ||
| 991 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; | ||
| 992 | break; | ||
| 993 | case Opt_sec_lkeyp: | ||
| 994 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 995 | mnt->auth_flavor_len = 1; | ||
| 996 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; | ||
| 997 | break; | ||
| 998 | case Opt_sec_spkm: | ||
| 999 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 1000 | mnt->auth_flavor_len = 1; | ||
| 1001 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; | ||
| 1002 | break; | ||
| 1003 | case Opt_sec_spkmi: | ||
| 1004 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 1005 | mnt->auth_flavor_len = 1; | ||
| 1006 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; | ||
| 1007 | break; | ||
| 1008 | case Opt_sec_spkmp: | ||
| 1009 | mnt->flags |= NFS_MOUNT_SECFLAVOUR; | ||
| 1010 | mnt->auth_flavor_len = 1; | ||
| 1011 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; | ||
| 1012 | break; | ||
| 1013 | default: | ||
| 1014 | goto out_unrec_sec; | ||
| 1015 | } | 1175 | } |
| 1016 | break; | 1176 | break; |
| 1017 | case Opt_proto: | 1177 | case Opt_proto: |
| @@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw, | |||
| 1026 | case Opt_xprt_udp: | 1186 | case Opt_xprt_udp: |
| 1027 | mnt->flags &= ~NFS_MOUNT_TCP; | 1187 | mnt->flags &= ~NFS_MOUNT_TCP; |
| 1028 | mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; | 1188 | mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; |
| 1029 | mnt->timeo = 7; | ||
| 1030 | mnt->retrans = 5; | ||
| 1031 | break; | 1189 | break; |
| 1032 | case Opt_xprt_tcp: | 1190 | case Opt_xprt_tcp: |
| 1033 | mnt->flags |= NFS_MOUNT_TCP; | 1191 | mnt->flags |= NFS_MOUNT_TCP; |
| 1034 | mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; | 1192 | mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; |
| 1035 | mnt->timeo = 600; | ||
| 1036 | mnt->retrans = 2; | ||
| 1037 | break; | 1193 | break; |
| 1038 | case Opt_xprt_rdma: | 1194 | case Opt_xprt_rdma: |
| 1039 | /* vector side protocols to TCP */ | 1195 | /* vector side protocols to TCP */ |
| 1040 | mnt->flags |= NFS_MOUNT_TCP; | 1196 | mnt->flags |= NFS_MOUNT_TCP; |
| 1041 | mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; | 1197 | mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; |
| 1042 | mnt->timeo = 600; | ||
| 1043 | mnt->retrans = 2; | ||
| 1044 | break; | 1198 | break; |
| 1045 | default: | 1199 | default: |
| 1046 | goto out_unrec_xprt; | 1200 | errors++; |
| 1201 | dfprintk(MOUNT, "NFS: unrecognized " | ||
| 1202 | "transport protocol\n"); | ||
| 1047 | } | 1203 | } |
| 1048 | break; | 1204 | break; |
| 1049 | case Opt_mountproto: | 1205 | case Opt_mountproto: |
| @@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw, | |||
| 1063 | break; | 1219 | break; |
| 1064 | case Opt_xprt_rdma: /* not used for side protocols */ | 1220 | case Opt_xprt_rdma: /* not used for side protocols */ |
| 1065 | default: | 1221 | default: |
| 1066 | goto out_unrec_xprt; | 1222 | errors++; |
| 1223 | dfprintk(MOUNT, "NFS: unrecognized " | ||
| 1224 | "transport protocol\n"); | ||
| 1067 | } | 1225 | } |
| 1068 | break; | 1226 | break; |
| 1069 | case Opt_addr: | 1227 | case Opt_addr: |
| 1070 | string = match_strdup(args); | 1228 | string = match_strdup(args); |
| 1071 | if (string == NULL) | 1229 | if (string == NULL) |
| 1072 | goto out_nomem; | 1230 | goto out_nomem; |
| 1073 | nfs_parse_server_address(string, (struct sockaddr *) | 1231 | nfs_parse_ip_address(string, strlen(string), |
| 1074 | &mnt->nfs_server.address, | 1232 | (struct sockaddr *) |
| 1075 | &mnt->nfs_server.addrlen); | 1233 | &mnt->nfs_server.address, |
| 1234 | &mnt->nfs_server.addrlen); | ||
| 1076 | kfree(string); | 1235 | kfree(string); |
| 1077 | break; | 1236 | break; |
| 1078 | case Opt_clientaddr: | 1237 | case Opt_clientaddr: |
| @@ -1093,24 +1252,33 @@ static int nfs_parse_mount_options(char *raw, | |||
| 1093 | string = match_strdup(args); | 1252 | string = match_strdup(args); |
| 1094 | if (string == NULL) | 1253 | if (string == NULL) |
| 1095 | goto out_nomem; | 1254 | goto out_nomem; |
| 1096 | nfs_parse_server_address(string, (struct sockaddr *) | 1255 | nfs_parse_ip_address(string, strlen(string), |
| 1097 | &mnt->mount_server.address, | 1256 | (struct sockaddr *) |
| 1098 | &mnt->mount_server.addrlen); | 1257 | &mnt->mount_server.address, |
| 1258 | &mnt->mount_server.addrlen); | ||
| 1099 | kfree(string); | 1259 | kfree(string); |
| 1100 | break; | 1260 | break; |
| 1101 | 1261 | ||
| 1262 | /* | ||
| 1263 | * Special options | ||
| 1264 | */ | ||
| 1265 | case Opt_sloppy: | ||
| 1266 | sloppy = 1; | ||
| 1267 | dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); | ||
| 1268 | break; | ||
| 1102 | case Opt_userspace: | 1269 | case Opt_userspace: |
| 1103 | case Opt_deprecated: | 1270 | case Opt_deprecated: |
| 1271 | dfprintk(MOUNT, "NFS: ignoring mount option " | ||
| 1272 | "'%s'\n", p); | ||
| 1104 | break; | 1273 | break; |
| 1105 | 1274 | ||
| 1106 | default: | 1275 | default: |
| 1107 | goto out_unknown; | 1276 | errors++; |
| 1277 | dfprintk(MOUNT, "NFS: unrecognized mount option " | ||
| 1278 | "'%s'\n", p); | ||
| 1108 | } | 1279 | } |
| 1109 | } | 1280 | } |
| 1110 | 1281 | ||
| 1111 | nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, | ||
| 1112 | mnt->nfs_server.port); | ||
| 1113 | |||
| 1114 | return 1; | 1282 | return 1; |
| 1115 | 1283 | ||
| 1116 | out_nomem: | 1284 | out_nomem: |
| @@ -1120,21 +1288,6 @@ out_security_failure: | |||
| 1120 | free_secdata(secdata); | 1288 | free_secdata(secdata); |
| 1121 | printk(KERN_INFO "NFS: security options invalid: %d\n", rc); | 1289 | printk(KERN_INFO "NFS: security options invalid: %d\n", rc); |
| 1122 | return 0; | 1290 | return 0; |
| 1123 | out_unrec_vers: | ||
| 1124 | printk(KERN_INFO "NFS: unrecognized NFS version number\n"); | ||
| 1125 | return 0; | ||
| 1126 | |||
| 1127 | out_unrec_xprt: | ||
| 1128 | printk(KERN_INFO "NFS: unrecognized transport protocol\n"); | ||
| 1129 | return 0; | ||
| 1130 | |||
| 1131 | out_unrec_sec: | ||
| 1132 | printk(KERN_INFO "NFS: unrecognized security flavor\n"); | ||
| 1133 | return 0; | ||
| 1134 | |||
| 1135 | out_unknown: | ||
| 1136 | printk(KERN_INFO "NFS: unknown mount option: %s\n", p); | ||
| 1137 | return 0; | ||
| 1138 | } | 1291 | } |
| 1139 | 1292 | ||
| 1140 | /* | 1293 | /* |
| @@ -1188,11 +1341,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, | |||
| 1188 | if (status == 0) | 1341 | if (status == 0) |
| 1189 | return 0; | 1342 | return 0; |
| 1190 | 1343 | ||
| 1191 | dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", | 1344 | dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", |
| 1192 | hostname, status); | 1345 | hostname, status); |
| 1193 | return status; | 1346 | return status; |
| 1194 | } | 1347 | } |
| 1195 | 1348 | ||
| 1349 | static int nfs_parse_simple_hostname(const char *dev_name, | ||
| 1350 | char **hostname, size_t maxnamlen, | ||
| 1351 | char **export_path, size_t maxpathlen) | ||
| 1352 | { | ||
| 1353 | size_t len; | ||
| 1354 | char *colon, *comma; | ||
| 1355 | |||
| 1356 | colon = strchr(dev_name, ':'); | ||
| 1357 | if (colon == NULL) | ||
| 1358 | goto out_bad_devname; | ||
| 1359 | |||
| 1360 | len = colon - dev_name; | ||
| 1361 | if (len > maxnamlen) | ||
| 1362 | goto out_hostname; | ||
| 1363 | |||
| 1364 | /* N.B. caller will free nfs_server.hostname in all cases */ | ||
| 1365 | *hostname = kstrndup(dev_name, len, GFP_KERNEL); | ||
| 1366 | if (!*hostname) | ||
| 1367 | goto out_nomem; | ||
| 1368 | |||
| 1369 | /* kill possible hostname list: not supported */ | ||
| 1370 | comma = strchr(*hostname, ','); | ||
| 1371 | if (comma != NULL) { | ||
| 1372 | if (comma == *hostname) | ||
| 1373 | goto out_bad_devname; | ||
| 1374 | *comma = '\0'; | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | colon++; | ||
| 1378 | len = strlen(colon); | ||
| 1379 | if (len > maxpathlen) | ||
| 1380 | goto out_path; | ||
| 1381 | *export_path = kstrndup(colon, len, GFP_KERNEL); | ||
| 1382 | if (!*export_path) | ||
| 1383 | goto out_nomem; | ||
| 1384 | |||
| 1385 | dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); | ||
| 1386 | return 0; | ||
| 1387 | |||
| 1388 | out_bad_devname: | ||
| 1389 | dfprintk(MOUNT, "NFS: device name not in host:path format\n"); | ||
| 1390 | return -EINVAL; | ||
| 1391 | |||
| 1392 | out_nomem: | ||
| 1393 | dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); | ||
| 1394 | return -ENOMEM; | ||
| 1395 | |||
| 1396 | out_hostname: | ||
| 1397 | dfprintk(MOUNT, "NFS: server hostname too long\n"); | ||
| 1398 | return -ENAMETOOLONG; | ||
| 1399 | |||
| 1400 | out_path: | ||
| 1401 | dfprintk(MOUNT, "NFS: export pathname too long\n"); | ||
| 1402 | return -ENAMETOOLONG; | ||
| 1403 | } | ||
| 1404 | |||
| 1405 | /* | ||
| 1406 | * Hostname has square brackets around it because it contains one or | ||
| 1407 | * more colons. We look for the first closing square bracket, and a | ||
| 1408 | * colon must follow it. | ||
| 1409 | */ | ||
| 1410 | static int nfs_parse_protected_hostname(const char *dev_name, | ||
| 1411 | char **hostname, size_t maxnamlen, | ||
| 1412 | char **export_path, size_t maxpathlen) | ||
| 1413 | { | ||
| 1414 | size_t len; | ||
| 1415 | char *start, *end; | ||
| 1416 | |||
| 1417 | start = (char *)(dev_name + 1); | ||
| 1418 | |||
| 1419 | end = strchr(start, ']'); | ||
| 1420 | if (end == NULL) | ||
| 1421 | goto out_bad_devname; | ||
| 1422 | if (*(end + 1) != ':') | ||
| 1423 | goto out_bad_devname; | ||
| 1424 | |||
| 1425 | len = end - start; | ||
| 1426 | if (len > maxnamlen) | ||
| 1427 | goto out_hostname; | ||
| 1428 | |||
| 1429 | /* N.B. caller will free nfs_server.hostname in all cases */ | ||
| 1430 | *hostname = kstrndup(start, len, GFP_KERNEL); | ||
| 1431 | if (*hostname == NULL) | ||
| 1432 | goto out_nomem; | ||
| 1433 | |||
| 1434 | end += 2; | ||
| 1435 | len = strlen(end); | ||
| 1436 | if (len > maxpathlen) | ||
| 1437 | goto out_path; | ||
| 1438 | *export_path = kstrndup(end, len, GFP_KERNEL); | ||
| 1439 | if (!*export_path) | ||
| 1440 | goto out_nomem; | ||
| 1441 | |||
| 1442 | return 0; | ||
| 1443 | |||
| 1444 | out_bad_devname: | ||
| 1445 | dfprintk(MOUNT, "NFS: device name not in host:path format\n"); | ||
| 1446 | return -EINVAL; | ||
| 1447 | |||
| 1448 | out_nomem: | ||
| 1449 | dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); | ||
| 1450 | return -ENOMEM; | ||
| 1451 | |||
| 1452 | out_hostname: | ||
| 1453 | dfprintk(MOUNT, "NFS: server hostname too long\n"); | ||
| 1454 | return -ENAMETOOLONG; | ||
| 1455 | |||
| 1456 | out_path: | ||
| 1457 | dfprintk(MOUNT, "NFS: export pathname too long\n"); | ||
| 1458 | return -ENAMETOOLONG; | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | /* | ||
| 1462 | * Split "dev_name" into "hostname:export_path". | ||
| 1463 | * | ||
| 1464 | * The leftmost colon demarks the split between the server's hostname | ||
| 1465 | * and the export path. If the hostname starts with a left square | ||
| 1466 | * bracket, then it may contain colons. | ||
| 1467 | * | ||
| 1468 | * Note: caller frees hostname and export path, even on error. | ||
| 1469 | */ | ||
| 1470 | static int nfs_parse_devname(const char *dev_name, | ||
| 1471 | char **hostname, size_t maxnamlen, | ||
| 1472 | char **export_path, size_t maxpathlen) | ||
| 1473 | { | ||
| 1474 | if (*dev_name == '[') | ||
| 1475 | return nfs_parse_protected_hostname(dev_name, | ||
| 1476 | hostname, maxnamlen, | ||
| 1477 | export_path, maxpathlen); | ||
| 1478 | |||
| 1479 | return nfs_parse_simple_hostname(dev_name, | ||
| 1480 | hostname, maxnamlen, | ||
| 1481 | export_path, maxpathlen); | ||
| 1482 | } | ||
| 1483 | |||
| 1196 | /* | 1484 | /* |
| 1197 | * Validate the NFS2/NFS3 mount data | 1485 | * Validate the NFS2/NFS3 mount data |
| 1198 | * - fills in the mount root filehandle | 1486 | * - fills in the mount root filehandle |
| @@ -1216,24 +1504,20 @@ static int nfs_validate_mount_data(void *options, | |||
| 1216 | { | 1504 | { |
| 1217 | struct nfs_mount_data *data = (struct nfs_mount_data *)options; | 1505 | struct nfs_mount_data *data = (struct nfs_mount_data *)options; |
| 1218 | 1506 | ||
| 1219 | memset(args, 0, sizeof(*args)); | ||
| 1220 | |||
| 1221 | if (data == NULL) | 1507 | if (data == NULL) |
| 1222 | goto out_no_data; | 1508 | goto out_no_data; |
| 1223 | 1509 | ||
| 1224 | args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); | 1510 | args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); |
| 1225 | args->rsize = NFS_MAX_FILE_IO_SIZE; | 1511 | args->rsize = NFS_MAX_FILE_IO_SIZE; |
| 1226 | args->wsize = NFS_MAX_FILE_IO_SIZE; | 1512 | args->wsize = NFS_MAX_FILE_IO_SIZE; |
| 1227 | args->timeo = 600; | 1513 | args->acregmin = NFS_DEF_ACREGMIN; |
| 1228 | args->retrans = 2; | 1514 | args->acregmax = NFS_DEF_ACREGMAX; |
| 1229 | args->acregmin = 3; | 1515 | args->acdirmin = NFS_DEF_ACDIRMIN; |
| 1230 | args->acregmax = 60; | 1516 | args->acdirmax = NFS_DEF_ACDIRMAX; |
| 1231 | args->acdirmin = 30; | ||
| 1232 | args->acdirmax = 60; | ||
| 1233 | args->mount_server.port = 0; /* autobind unless user sets port */ | 1517 | args->mount_server.port = 0; /* autobind unless user sets port */ |
| 1234 | args->mount_server.protocol = XPRT_TRANSPORT_UDP; | ||
| 1235 | args->nfs_server.port = 0; /* autobind unless user sets port */ | 1518 | args->nfs_server.port = 0; /* autobind unless user sets port */ |
| 1236 | args->nfs_server.protocol = XPRT_TRANSPORT_TCP; | 1519 | args->nfs_server.protocol = XPRT_TRANSPORT_TCP; |
| 1520 | args->auth_flavors[0] = RPC_AUTH_UNIX; | ||
| 1237 | 1521 | ||
| 1238 | switch (data->version) { | 1522 | switch (data->version) { |
| 1239 | case 1: | 1523 | case 1: |
| @@ -1251,13 +1535,13 @@ static int nfs_validate_mount_data(void *options, | |||
| 1251 | case 5: | 1535 | case 5: |
| 1252 | memset(data->context, 0, sizeof(data->context)); | 1536 | memset(data->context, 0, sizeof(data->context)); |
| 1253 | case 6: | 1537 | case 6: |
| 1254 | if (data->flags & NFS_MOUNT_VER3) | 1538 | if (data->flags & NFS_MOUNT_VER3) { |
| 1539 | if (data->root.size > NFS3_FHSIZE || data->root.size == 0) | ||
| 1540 | goto out_invalid_fh; | ||
| 1255 | mntfh->size = data->root.size; | 1541 | mntfh->size = data->root.size; |
| 1256 | else | 1542 | } else |
| 1257 | mntfh->size = NFS2_FHSIZE; | 1543 | mntfh->size = NFS2_FHSIZE; |
| 1258 | 1544 | ||
| 1259 | if (mntfh->size > sizeof(mntfh->data)) | ||
| 1260 | goto out_invalid_fh; | ||
| 1261 | 1545 | ||
| 1262 | memcpy(mntfh->data, data->root.data, mntfh->size); | 1546 | memcpy(mntfh->data, data->root.data, mntfh->size); |
| 1263 | if (mntfh->size < sizeof(mntfh->data)) | 1547 | if (mntfh->size < sizeof(mntfh->data)) |
| @@ -1291,7 +1575,9 @@ static int nfs_validate_mount_data(void *options, | |||
| 1291 | args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); | 1575 | args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); |
| 1292 | args->namlen = data->namlen; | 1576 | args->namlen = data->namlen; |
| 1293 | args->bsize = data->bsize; | 1577 | args->bsize = data->bsize; |
| 1294 | args->auth_flavors[0] = data->pseudoflavor; | 1578 | |
| 1579 | if (data->flags & NFS_MOUNT_SECFLAVOUR) | ||
| 1580 | args->auth_flavors[0] = data->pseudoflavor; | ||
| 1295 | if (!args->nfs_server.hostname) | 1581 | if (!args->nfs_server.hostname) |
| 1296 | goto out_nomem; | 1582 | goto out_nomem; |
| 1297 | 1583 | ||
| @@ -1323,8 +1609,6 @@ static int nfs_validate_mount_data(void *options, | |||
| 1323 | 1609 | ||
| 1324 | break; | 1610 | break; |
| 1325 | default: { | 1611 | default: { |
| 1326 | unsigned int len; | ||
| 1327 | char *c; | ||
| 1328 | int status; | 1612 | int status; |
| 1329 | 1613 | ||
| 1330 | if (nfs_parse_mount_options((char *)options, args) == 0) | 1614 | if (nfs_parse_mount_options((char *)options, args) == 0) |
| @@ -1334,21 +1618,22 @@ static int nfs_validate_mount_data(void *options, | |||
| 1334 | &args->nfs_server.address)) | 1618 | &args->nfs_server.address)) |
| 1335 | goto out_no_address; | 1619 | goto out_no_address; |
| 1336 | 1620 | ||
| 1337 | c = strchr(dev_name, ':'); | 1621 | nfs_set_port((struct sockaddr *)&args->nfs_server.address, |
| 1338 | if (c == NULL) | 1622 | args->nfs_server.port); |
| 1339 | return -EINVAL; | ||
| 1340 | len = c - dev_name; | ||
| 1341 | /* N.B. caller will free nfs_server.hostname in all cases */ | ||
| 1342 | args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); | ||
| 1343 | if (!args->nfs_server.hostname) | ||
| 1344 | goto out_nomem; | ||
| 1345 | 1623 | ||
| 1346 | c++; | 1624 | nfs_set_mount_transport_protocol(args); |
| 1347 | if (strlen(c) > NFS_MAXPATHLEN) | 1625 | |
| 1348 | return -ENAMETOOLONG; | 1626 | status = nfs_parse_devname(dev_name, |
| 1349 | args->nfs_server.export_path = c; | 1627 | &args->nfs_server.hostname, |
| 1628 | PAGE_SIZE, | ||
| 1629 | &args->nfs_server.export_path, | ||
| 1630 | NFS_MAXPATHLEN); | ||
| 1631 | if (!status) | ||
| 1632 | status = nfs_try_mount(args, mntfh); | ||
| 1633 | |||
| 1634 | kfree(args->nfs_server.export_path); | ||
| 1635 | args->nfs_server.export_path = NULL; | ||
| 1350 | 1636 | ||
| 1351 | status = nfs_try_mount(args, mntfh); | ||
| 1352 | if (status) | 1637 | if (status) |
| 1353 | return status; | 1638 | return status; |
| 1354 | 1639 | ||
| @@ -1356,9 +1641,6 @@ static int nfs_validate_mount_data(void *options, | |||
| 1356 | } | 1641 | } |
| 1357 | } | 1642 | } |
| 1358 | 1643 | ||
| 1359 | if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) | ||
| 1360 | args->auth_flavors[0] = RPC_AUTH_UNIX; | ||
| 1361 | |||
| 1362 | #ifndef CONFIG_NFS_V3 | 1644 | #ifndef CONFIG_NFS_V3 |
| 1363 | if (args->flags & NFS_MOUNT_VER3) | 1645 | if (args->flags & NFS_MOUNT_VER3) |
| 1364 | goto out_v3_not_compiled; | 1646 | goto out_v3_not_compiled; |
| @@ -1398,6 +1680,80 @@ out_invalid_fh: | |||
| 1398 | return -EINVAL; | 1680 | return -EINVAL; |
| 1399 | } | 1681 | } |
| 1400 | 1682 | ||
| 1683 | static int | ||
| 1684 | nfs_compare_remount_data(struct nfs_server *nfss, | ||
| 1685 | struct nfs_parsed_mount_data *data) | ||
| 1686 | { | ||
| 1687 | if (data->flags != nfss->flags || | ||
| 1688 | data->rsize != nfss->rsize || | ||
| 1689 | data->wsize != nfss->wsize || | ||
| 1690 | data->retrans != nfss->client->cl_timeout->to_retries || | ||
| 1691 | data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || | ||
| 1692 | data->acregmin != nfss->acregmin / HZ || | ||
| 1693 | data->acregmax != nfss->acregmax / HZ || | ||
| 1694 | data->acdirmin != nfss->acdirmin / HZ || | ||
| 1695 | data->acdirmax != nfss->acdirmax / HZ || | ||
| 1696 | data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || | ||
| 1697 | data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || | ||
| 1698 | memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, | ||
| 1699 | data->nfs_server.addrlen) != 0) | ||
| 1700 | return -EINVAL; | ||
| 1701 | |||
| 1702 | return 0; | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | static int | ||
| 1706 | nfs_remount(struct super_block *sb, int *flags, char *raw_data) | ||
| 1707 | { | ||
| 1708 | int error; | ||
| 1709 | struct nfs_server *nfss = sb->s_fs_info; | ||
| 1710 | struct nfs_parsed_mount_data *data; | ||
| 1711 | struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; | ||
| 1712 | struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; | ||
| 1713 | u32 nfsvers = nfss->nfs_client->rpc_ops->version; | ||
| 1714 | |||
| 1715 | /* | ||
| 1716 | * Userspace mount programs that send binary options generally send | ||
| 1717 | * them populated with default values. We have no way to know which | ||
| 1718 | * ones were explicitly specified. Fall back to legacy behavior and | ||
| 1719 | * just return success. | ||
| 1720 | */ | ||
| 1721 | if ((nfsvers == 4 && options4->version == 1) || | ||
| 1722 | (nfsvers <= 3 && options->version >= 1 && | ||
| 1723 | options->version <= 6)) | ||
| 1724 | return 0; | ||
| 1725 | |||
| 1726 | data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
| 1727 | if (data == NULL) | ||
| 1728 | return -ENOMEM; | ||
| 1729 | |||
| 1730 | /* fill out struct with values from existing mount */ | ||
| 1731 | data->flags = nfss->flags; | ||
| 1732 | data->rsize = nfss->rsize; | ||
| 1733 | data->wsize = nfss->wsize; | ||
| 1734 | data->retrans = nfss->client->cl_timeout->to_retries; | ||
| 1735 | data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; | ||
| 1736 | data->acregmin = nfss->acregmin / HZ; | ||
| 1737 | data->acregmax = nfss->acregmax / HZ; | ||
| 1738 | data->acdirmin = nfss->acdirmin / HZ; | ||
| 1739 | data->acdirmax = nfss->acdirmax / HZ; | ||
| 1740 | data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; | ||
| 1741 | data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; | ||
| 1742 | memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, | ||
| 1743 | data->nfs_server.addrlen); | ||
| 1744 | |||
| 1745 | /* overwrite those values with any that were specified */ | ||
| 1746 | error = nfs_parse_mount_options((char *)options, data); | ||
| 1747 | if (error < 0) | ||
| 1748 | goto out; | ||
| 1749 | |||
| 1750 | /* compare new mount options with old ones */ | ||
| 1751 | error = nfs_compare_remount_data(nfss, data); | ||
| 1752 | out: | ||
| 1753 | kfree(data); | ||
| 1754 | return error; | ||
| 1755 | } | ||
| 1756 | |||
| 1401 | /* | 1757 | /* |
| 1402 | * Initialise the common bits of the superblock | 1758 | * Initialise the common bits of the superblock |
| 1403 | */ | 1759 | */ |
| @@ -1585,24 +1941,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
| 1585 | { | 1941 | { |
| 1586 | struct nfs_server *server = NULL; | 1942 | struct nfs_server *server = NULL; |
| 1587 | struct super_block *s; | 1943 | struct super_block *s; |
| 1588 | struct nfs_fh mntfh; | 1944 | struct nfs_parsed_mount_data *data; |
| 1589 | struct nfs_parsed_mount_data data; | 1945 | struct nfs_fh *mntfh; |
| 1590 | struct dentry *mntroot; | 1946 | struct dentry *mntroot; |
| 1591 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; | 1947 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; |
| 1592 | struct nfs_sb_mountdata sb_mntdata = { | 1948 | struct nfs_sb_mountdata sb_mntdata = { |
| 1593 | .mntflags = flags, | 1949 | .mntflags = flags, |
| 1594 | }; | 1950 | }; |
| 1595 | int error; | 1951 | int error = -ENOMEM; |
| 1596 | 1952 | ||
| 1597 | security_init_mnt_opts(&data.lsm_opts); | 1953 | data = kzalloc(sizeof(*data), GFP_KERNEL); |
| 1954 | mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); | ||
| 1955 | if (data == NULL || mntfh == NULL) | ||
| 1956 | goto out_free_fh; | ||
| 1957 | |||
| 1958 | security_init_mnt_opts(&data->lsm_opts); | ||
| 1598 | 1959 | ||
| 1599 | /* Validate the mount data */ | 1960 | /* Validate the mount data */ |
| 1600 | error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); | 1961 | error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); |
| 1601 | if (error < 0) | 1962 | if (error < 0) |
| 1602 | goto out; | 1963 | goto out; |
| 1603 | 1964 | ||
| 1604 | /* Get a volume representation */ | 1965 | /* Get a volume representation */ |
| 1605 | server = nfs_create_server(&data, &mntfh); | 1966 | server = nfs_create_server(data, mntfh); |
| 1606 | if (IS_ERR(server)) { | 1967 | if (IS_ERR(server)) { |
| 1607 | error = PTR_ERR(server); | 1968 | error = PTR_ERR(server); |
| 1608 | goto out; | 1969 | goto out; |
| @@ -1630,16 +1991,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
| 1630 | 1991 | ||
| 1631 | if (!s->s_root) { | 1992 | if (!s->s_root) { |
| 1632 | /* initial superblock/root creation */ | 1993 | /* initial superblock/root creation */ |
| 1633 | nfs_fill_super(s, &data); | 1994 | nfs_fill_super(s, data); |
| 1634 | } | 1995 | } |
| 1635 | 1996 | ||
| 1636 | mntroot = nfs_get_root(s, &mntfh); | 1997 | mntroot = nfs_get_root(s, mntfh); |
| 1637 | if (IS_ERR(mntroot)) { | 1998 | if (IS_ERR(mntroot)) { |
| 1638 | error = PTR_ERR(mntroot); | 1999 | error = PTR_ERR(mntroot); |
| 1639 | goto error_splat_super; | 2000 | goto error_splat_super; |
| 1640 | } | 2001 | } |
| 1641 | 2002 | ||
| 1642 | error = security_sb_set_mnt_opts(s, &data.lsm_opts); | 2003 | error = security_sb_set_mnt_opts(s, &data->lsm_opts); |
| 1643 | if (error) | 2004 | if (error) |
| 1644 | goto error_splat_root; | 2005 | goto error_splat_root; |
| 1645 | 2006 | ||
| @@ -1649,9 +2010,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
| 1649 | error = 0; | 2010 | error = 0; |
| 1650 | 2011 | ||
| 1651 | out: | 2012 | out: |
| 1652 | kfree(data.nfs_server.hostname); | 2013 | kfree(data->nfs_server.hostname); |
| 1653 | kfree(data.mount_server.hostname); | 2014 | kfree(data->mount_server.hostname); |
| 1654 | security_free_mnt_opts(&data.lsm_opts); | 2015 | security_free_mnt_opts(&data->lsm_opts); |
| 2016 | out_free_fh: | ||
| 2017 | kfree(mntfh); | ||
| 2018 | kfree(data); | ||
| 1655 | return error; | 2019 | return error; |
| 1656 | 2020 | ||
| 1657 | out_err_nosb: | 2021 | out_err_nosb: |
| @@ -1800,21 +2164,18 @@ static int nfs4_validate_mount_data(void *options, | |||
| 1800 | struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; | 2164 | struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; |
| 1801 | char *c; | 2165 | char *c; |
| 1802 | 2166 | ||
| 1803 | memset(args, 0, sizeof(*args)); | ||
| 1804 | |||
| 1805 | if (data == NULL) | 2167 | if (data == NULL) |
| 1806 | goto out_no_data; | 2168 | goto out_no_data; |
| 1807 | 2169 | ||
| 1808 | args->rsize = NFS_MAX_FILE_IO_SIZE; | 2170 | args->rsize = NFS_MAX_FILE_IO_SIZE; |
| 1809 | args->wsize = NFS_MAX_FILE_IO_SIZE; | 2171 | args->wsize = NFS_MAX_FILE_IO_SIZE; |
| 1810 | args->timeo = 600; | 2172 | args->acregmin = NFS_DEF_ACREGMIN; |
| 1811 | args->retrans = 2; | 2173 | args->acregmax = NFS_DEF_ACREGMAX; |
| 1812 | args->acregmin = 3; | 2174 | args->acdirmin = NFS_DEF_ACDIRMIN; |
| 1813 | args->acregmax = 60; | 2175 | args->acdirmax = NFS_DEF_ACDIRMAX; |
| 1814 | args->acdirmin = 30; | ||
| 1815 | args->acdirmax = 60; | ||
| 1816 | args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ | 2176 | args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ |
| 1817 | args->nfs_server.protocol = XPRT_TRANSPORT_TCP; | 2177 | args->auth_flavors[0] = RPC_AUTH_UNIX; |
| 2178 | args->auth_flavor_len = 0; | ||
| 1818 | 2179 | ||
| 1819 | switch (data->version) { | 2180 | switch (data->version) { |
| 1820 | case 1: | 2181 | case 1: |
| @@ -1830,18 +2191,13 @@ static int nfs4_validate_mount_data(void *options, | |||
| 1830 | &args->nfs_server.address)) | 2191 | &args->nfs_server.address)) |
| 1831 | goto out_no_address; | 2192 | goto out_no_address; |
| 1832 | 2193 | ||
| 1833 | switch (data->auth_flavourlen) { | 2194 | if (data->auth_flavourlen) { |
| 1834 | case 0: | 2195 | if (data->auth_flavourlen > 1) |
| 1835 | args->auth_flavors[0] = RPC_AUTH_UNIX; | 2196 | goto out_inval_auth; |
| 1836 | break; | ||
| 1837 | case 1: | ||
| 1838 | if (copy_from_user(&args->auth_flavors[0], | 2197 | if (copy_from_user(&args->auth_flavors[0], |
| 1839 | data->auth_flavours, | 2198 | data->auth_flavours, |
| 1840 | sizeof(args->auth_flavors[0]))) | 2199 | sizeof(args->auth_flavors[0]))) |
| 1841 | return -EFAULT; | 2200 | return -EFAULT; |
| 1842 | break; | ||
| 1843 | default: | ||
| 1844 | goto out_inval_auth; | ||
| 1845 | } | 2201 | } |
| 1846 | 2202 | ||
| 1847 | c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); | 2203 | c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); |
| @@ -1875,10 +2231,11 @@ static int nfs4_validate_mount_data(void *options, | |||
| 1875 | args->acdirmin = data->acdirmin; | 2231 | args->acdirmin = data->acdirmin; |
| 1876 | args->acdirmax = data->acdirmax; | 2232 | args->acdirmax = data->acdirmax; |
| 1877 | args->nfs_server.protocol = data->proto; | 2233 | args->nfs_server.protocol = data->proto; |
| 2234 | nfs_validate_transport_protocol(args); | ||
| 1878 | 2235 | ||
| 1879 | break; | 2236 | break; |
| 1880 | default: { | 2237 | default: { |
| 1881 | unsigned int len; | 2238 | int status; |
| 1882 | 2239 | ||
| 1883 | if (nfs_parse_mount_options((char *)options, args) == 0) | 2240 | if (nfs_parse_mount_options((char *)options, args) == 0) |
| 1884 | return -EINVAL; | 2241 | return -EINVAL; |
| @@ -1887,44 +2244,25 @@ static int nfs4_validate_mount_data(void *options, | |||
| 1887 | &args->nfs_server.address)) | 2244 | &args->nfs_server.address)) |
| 1888 | return -EINVAL; | 2245 | return -EINVAL; |
| 1889 | 2246 | ||
| 1890 | switch (args->auth_flavor_len) { | 2247 | nfs_set_port((struct sockaddr *)&args->nfs_server.address, |
| 1891 | case 0: | 2248 | args->nfs_server.port); |
| 1892 | args->auth_flavors[0] = RPC_AUTH_UNIX; | ||
| 1893 | break; | ||
| 1894 | case 1: | ||
| 1895 | break; | ||
| 1896 | default: | ||
| 1897 | goto out_inval_auth; | ||
| 1898 | } | ||
| 1899 | |||
| 1900 | /* | ||
| 1901 | * Split "dev_name" into "hostname:mntpath". | ||
| 1902 | */ | ||
| 1903 | c = strchr(dev_name, ':'); | ||
| 1904 | if (c == NULL) | ||
| 1905 | return -EINVAL; | ||
| 1906 | /* while calculating len, pretend ':' is '\0' */ | ||
| 1907 | len = c - dev_name; | ||
| 1908 | if (len > NFS4_MAXNAMLEN) | ||
| 1909 | return -ENAMETOOLONG; | ||
| 1910 | /* N.B. caller will free nfs_server.hostname in all cases */ | ||
| 1911 | args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); | ||
| 1912 | if (!args->nfs_server.hostname) | ||
| 1913 | goto out_nomem; | ||
| 1914 | 2249 | ||
| 1915 | c++; /* step over the ':' */ | 2250 | nfs_validate_transport_protocol(args); |
| 1916 | len = strlen(c); | ||
| 1917 | if (len > NFS4_MAXPATHLEN) | ||
| 1918 | return -ENAMETOOLONG; | ||
| 1919 | args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); | ||
| 1920 | if (!args->nfs_server.export_path) | ||
| 1921 | goto out_nomem; | ||
| 1922 | 2251 | ||
| 1923 | dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); | 2252 | if (args->auth_flavor_len > 1) |
| 2253 | goto out_inval_auth; | ||
| 1924 | 2254 | ||
| 1925 | if (args->client_address == NULL) | 2255 | if (args->client_address == NULL) |
| 1926 | goto out_no_client_address; | 2256 | goto out_no_client_address; |
| 1927 | 2257 | ||
| 2258 | status = nfs_parse_devname(dev_name, | ||
| 2259 | &args->nfs_server.hostname, | ||
| 2260 | NFS4_MAXNAMLEN, | ||
| 2261 | &args->nfs_server.export_path, | ||
| 2262 | NFS4_MAXPATHLEN); | ||
| 2263 | if (status < 0) | ||
| 2264 | return status; | ||
| 2265 | |||
| 1928 | break; | 2266 | break; |
| 1929 | } | 2267 | } |
| 1930 | } | 2268 | } |
| @@ -1940,10 +2278,6 @@ out_inval_auth: | |||
| 1940 | data->auth_flavourlen); | 2278 | data->auth_flavourlen); |
| 1941 | return -EINVAL; | 2279 | return -EINVAL; |
| 1942 | 2280 | ||
| 1943 | out_nomem: | ||
| 1944 | dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n"); | ||
| 1945 | return -ENOMEM; | ||
| 1946 | |||
| 1947 | out_no_address: | 2281 | out_no_address: |
| 1948 | dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); | 2282 | dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); |
| 1949 | return -EINVAL; | 2283 | return -EINVAL; |
| @@ -1959,26 +2293,31 @@ out_no_client_address: | |||
| 1959 | static int nfs4_get_sb(struct file_system_type *fs_type, | 2293 | static int nfs4_get_sb(struct file_system_type *fs_type, |
| 1960 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) | 2294 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) |
| 1961 | { | 2295 | { |
| 1962 | struct nfs_parsed_mount_data data; | 2296 | struct nfs_parsed_mount_data *data; |
| 1963 | struct super_block *s; | 2297 | struct super_block *s; |
| 1964 | struct nfs_server *server; | 2298 | struct nfs_server *server; |
| 1965 | struct nfs_fh mntfh; | 2299 | struct nfs_fh *mntfh; |
| 1966 | struct dentry *mntroot; | 2300 | struct dentry *mntroot; |
| 1967 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; | 2301 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; |
| 1968 | struct nfs_sb_mountdata sb_mntdata = { | 2302 | struct nfs_sb_mountdata sb_mntdata = { |
| 1969 | .mntflags = flags, | 2303 | .mntflags = flags, |
| 1970 | }; | 2304 | }; |
| 1971 | int error; | 2305 | int error = -ENOMEM; |
| 2306 | |||
| 2307 | data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
| 2308 | mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); | ||
| 2309 | if (data == NULL || mntfh == NULL) | ||
| 2310 | goto out_free_fh; | ||
| 1972 | 2311 | ||
| 1973 | security_init_mnt_opts(&data.lsm_opts); | 2312 | security_init_mnt_opts(&data->lsm_opts); |
| 1974 | 2313 | ||
| 1975 | /* Validate the mount data */ | 2314 | /* Validate the mount data */ |
| 1976 | error = nfs4_validate_mount_data(raw_data, &data, dev_name); | 2315 | error = nfs4_validate_mount_data(raw_data, data, dev_name); |
| 1977 | if (error < 0) | 2316 | if (error < 0) |
| 1978 | goto out; | 2317 | goto out; |
| 1979 | 2318 | ||
| 1980 | /* Get a volume representation */ | 2319 | /* Get a volume representation */ |
| 1981 | server = nfs4_create_server(&data, &mntfh); | 2320 | server = nfs4_create_server(data, mntfh); |
| 1982 | if (IS_ERR(server)) { | 2321 | if (IS_ERR(server)) { |
| 1983 | error = PTR_ERR(server); | 2322 | error = PTR_ERR(server); |
| 1984 | goto out; | 2323 | goto out; |
| @@ -2009,13 +2348,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, | |||
| 2009 | nfs4_fill_super(s); | 2348 | nfs4_fill_super(s); |
| 2010 | } | 2349 | } |
| 2011 | 2350 | ||
| 2012 | mntroot = nfs4_get_root(s, &mntfh); | 2351 | mntroot = nfs4_get_root(s, mntfh); |
| 2013 | if (IS_ERR(mntroot)) { | 2352 | if (IS_ERR(mntroot)) { |
| 2014 | error = PTR_ERR(mntroot); | 2353 | error = PTR_ERR(mntroot); |
| 2015 | goto error_splat_super; | 2354 | goto error_splat_super; |
| 2016 | } | 2355 | } |
| 2017 | 2356 | ||
| 2018 | error = security_sb_set_mnt_opts(s, &data.lsm_opts); | 2357 | error = security_sb_set_mnt_opts(s, &data->lsm_opts); |
| 2019 | if (error) | 2358 | if (error) |
| 2020 | goto error_splat_root; | 2359 | goto error_splat_root; |
| 2021 | 2360 | ||
| @@ -2025,10 +2364,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, | |||
| 2025 | error = 0; | 2364 | error = 0; |
| 2026 | 2365 | ||
| 2027 | out: | 2366 | out: |
| 2028 | kfree(data.client_address); | 2367 | kfree(data->client_address); |
| 2029 | kfree(data.nfs_server.export_path); | 2368 | kfree(data->nfs_server.export_path); |
| 2030 | kfree(data.nfs_server.hostname); | 2369 | kfree(data->nfs_server.hostname); |
| 2031 | security_free_mnt_opts(&data.lsm_opts); | 2370 | security_free_mnt_opts(&data->lsm_opts); |
| 2371 | out_free_fh: | ||
| 2372 | kfree(mntfh); | ||
| 2373 | kfree(data); | ||
| 2032 | return error; | 2374 | return error; |
| 2033 | 2375 | ||
| 2034 | out_free: | 2376 | out_free: |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e3259..3229e217c773 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
| @@ -34,9 +34,6 @@ | |||
| 34 | /* | 34 | /* |
| 35 | * Local function declarations | 35 | * Local function declarations |
| 36 | */ | 36 | */ |
| 37 | static struct nfs_page * nfs_update_request(struct nfs_open_context*, | ||
| 38 | struct page *, | ||
| 39 | unsigned int, unsigned int); | ||
| 40 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, | 37 | static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, |
| 41 | struct inode *inode, int ioflags); | 38 | struct inode *inode, int ioflags); |
| 42 | static void nfs_redirty_request(struct nfs_page *req); | 39 | static void nfs_redirty_request(struct nfs_page *req); |
| @@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page) | |||
| 136 | static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) | 133 | static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) |
| 137 | { | 134 | { |
| 138 | struct inode *inode = page->mapping->host; | 135 | struct inode *inode = page->mapping->host; |
| 139 | loff_t end, i_size = i_size_read(inode); | 136 | loff_t end, i_size; |
| 140 | pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; | 137 | pgoff_t end_index; |
| 141 | 138 | ||
| 139 | spin_lock(&inode->i_lock); | ||
| 140 | i_size = i_size_read(inode); | ||
| 141 | end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; | ||
| 142 | if (i_size > 0 && page->index < end_index) | 142 | if (i_size > 0 && page->index < end_index) |
| 143 | return; | 143 | goto out; |
| 144 | end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); | 144 | end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); |
| 145 | if (i_size >= end) | 145 | if (i_size >= end) |
| 146 | return; | 146 | goto out; |
| 147 | nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); | ||
| 148 | i_size_write(inode, end); | 147 | i_size_write(inode, end); |
| 148 | nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); | ||
| 149 | out: | ||
| 150 | spin_unlock(&inode->i_lock); | ||
| 149 | } | 151 | } |
| 150 | 152 | ||
| 151 | /* A writeback failed: mark the page as bad, and invalidate the page cache */ | 153 | /* A writeback failed: mark the page as bad, and invalidate the page cache */ |
| @@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int | |||
| 169 | SetPageUptodate(page); | 171 | SetPageUptodate(page); |
| 170 | } | 172 | } |
| 171 | 173 | ||
| 172 | static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, | ||
| 173 | unsigned int offset, unsigned int count) | ||
| 174 | { | ||
| 175 | struct nfs_page *req; | ||
| 176 | int ret; | ||
| 177 | |||
| 178 | for (;;) { | ||
| 179 | req = nfs_update_request(ctx, page, offset, count); | ||
| 180 | if (!IS_ERR(req)) | ||
| 181 | break; | ||
| 182 | ret = PTR_ERR(req); | ||
| 183 | if (ret != -EBUSY) | ||
| 184 | return ret; | ||
| 185 | ret = nfs_wb_page(page->mapping->host, page); | ||
| 186 | if (ret != 0) | ||
| 187 | return ret; | ||
| 188 | } | ||
| 189 | /* Update file length */ | ||
| 190 | nfs_grow_file(page, offset, count); | ||
| 191 | nfs_clear_page_tag_locked(req); | ||
| 192 | return 0; | ||
| 193 | } | ||
| 194 | |||
| 195 | static int wb_priority(struct writeback_control *wbc) | 174 | static int wb_priority(struct writeback_control *wbc) |
| 196 | { | 175 | { |
| 197 | if (wbc->for_reclaim) | 176 | if (wbc->for_reclaim) |
| @@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, | |||
| 268 | return ret; | 247 | return ret; |
| 269 | spin_lock(&inode->i_lock); | 248 | spin_lock(&inode->i_lock); |
| 270 | } | 249 | } |
| 271 | if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { | 250 | if (test_bit(PG_CLEAN, &req->wb_flags)) { |
| 272 | /* This request is marked for commit */ | ||
| 273 | spin_unlock(&inode->i_lock); | 251 | spin_unlock(&inode->i_lock); |
| 274 | nfs_clear_page_tag_locked(req); | 252 | BUG(); |
| 275 | nfs_pageio_complete(pgio); | ||
| 276 | return 0; | ||
| 277 | } | 253 | } |
| 278 | if (nfs_set_page_writeback(page) != 0) { | 254 | if (nfs_set_page_writeback(page) != 0) { |
| 279 | spin_unlock(&inode->i_lock); | 255 | spin_unlock(&inode->i_lock); |
| @@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
| 355 | /* | 331 | /* |
| 356 | * Insert a write request into an inode | 332 | * Insert a write request into an inode |
| 357 | */ | 333 | */ |
| 358 | static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) | 334 | static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) |
| 359 | { | 335 | { |
| 360 | struct nfs_inode *nfsi = NFS_I(inode); | 336 | struct nfs_inode *nfsi = NFS_I(inode); |
| 361 | int error; | 337 | int error; |
| 362 | 338 | ||
| 339 | error = radix_tree_preload(GFP_NOFS); | ||
| 340 | if (error != 0) | ||
| 341 | goto out; | ||
| 342 | |||
| 343 | /* Lock the request! */ | ||
| 344 | nfs_lock_request_dontget(req); | ||
| 345 | |||
| 346 | spin_lock(&inode->i_lock); | ||
| 363 | error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); | 347 | error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); |
| 364 | BUG_ON(error); | 348 | BUG_ON(error); |
| 365 | if (!nfsi->npages) { | 349 | if (!nfsi->npages) { |
| @@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) | |||
| 373 | kref_get(&req->wb_kref); | 357 | kref_get(&req->wb_kref); |
| 374 | radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, | 358 | radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, |
| 375 | NFS_PAGE_TAG_LOCKED); | 359 | NFS_PAGE_TAG_LOCKED); |
| 360 | spin_unlock(&inode->i_lock); | ||
| 361 | radix_tree_preload_end(); | ||
| 362 | out: | ||
| 363 | return error; | ||
| 376 | } | 364 | } |
| 377 | 365 | ||
| 378 | /* | 366 | /* |
| @@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req) | |||
| 405 | __set_page_dirty_nobuffers(req->wb_page); | 393 | __set_page_dirty_nobuffers(req->wb_page); |
| 406 | } | 394 | } |
| 407 | 395 | ||
| 408 | /* | ||
| 409 | * Check if a request is dirty | ||
| 410 | */ | ||
| 411 | static inline int | ||
| 412 | nfs_dirty_request(struct nfs_page *req) | ||
| 413 | { | ||
| 414 | struct page *page = req->wb_page; | ||
| 415 | |||
| 416 | if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags)) | ||
| 417 | return 0; | ||
| 418 | return !PageWriteback(page); | ||
| 419 | } | ||
| 420 | |||
| 421 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | 396 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) |
| 422 | /* | 397 | /* |
| 423 | * Add a request to the inode's commit list. | 398 | * Add a request to the inode's commit list. |
| @@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req) | |||
| 430 | 405 | ||
| 431 | spin_lock(&inode->i_lock); | 406 | spin_lock(&inode->i_lock); |
| 432 | nfsi->ncommit++; | 407 | nfsi->ncommit++; |
| 433 | set_bit(PG_NEED_COMMIT, &(req)->wb_flags); | 408 | set_bit(PG_CLEAN, &(req)->wb_flags); |
| 434 | radix_tree_tag_set(&nfsi->nfs_page_tree, | 409 | radix_tree_tag_set(&nfsi->nfs_page_tree, |
| 435 | req->wb_index, | 410 | req->wb_index, |
| 436 | NFS_PAGE_TAG_COMMIT); | 411 | NFS_PAGE_TAG_COMMIT); |
| @@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req) | |||
| 440 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); | 415 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); |
| 441 | } | 416 | } |
| 442 | 417 | ||
| 418 | static int | ||
| 419 | nfs_clear_request_commit(struct nfs_page *req) | ||
| 420 | { | ||
| 421 | struct page *page = req->wb_page; | ||
| 422 | |||
| 423 | if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { | ||
| 424 | dec_zone_page_state(page, NR_UNSTABLE_NFS); | ||
| 425 | dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); | ||
| 426 | return 1; | ||
| 427 | } | ||
| 428 | return 0; | ||
| 429 | } | ||
| 430 | |||
| 443 | static inline | 431 | static inline |
| 444 | int nfs_write_need_commit(struct nfs_write_data *data) | 432 | int nfs_write_need_commit(struct nfs_write_data *data) |
| 445 | { | 433 | { |
| @@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data) | |||
| 449 | static inline | 437 | static inline |
| 450 | int nfs_reschedule_unstable_write(struct nfs_page *req) | 438 | int nfs_reschedule_unstable_write(struct nfs_page *req) |
| 451 | { | 439 | { |
| 452 | if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { | 440 | if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { |
| 453 | nfs_mark_request_commit(req); | 441 | nfs_mark_request_commit(req); |
| 454 | return 1; | 442 | return 1; |
| 455 | } | 443 | } |
| @@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req) | |||
| 465 | { | 453 | { |
| 466 | } | 454 | } |
| 467 | 455 | ||
| 456 | static inline int | ||
| 457 | nfs_clear_request_commit(struct nfs_page *req) | ||
| 458 | { | ||
| 459 | return 0; | ||
| 460 | } | ||
| 461 | |||
| 468 | static inline | 462 | static inline |
| 469 | int nfs_write_need_commit(struct nfs_write_data *data) | 463 | int nfs_write_need_commit(struct nfs_write_data *data) |
| 470 | { | 464 | { |
| @@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head) | |||
| 522 | 516 | ||
| 523 | while(!list_empty(head)) { | 517 | while(!list_empty(head)) { |
| 524 | req = nfs_list_entry(head->next); | 518 | req = nfs_list_entry(head->next); |
| 525 | dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); | ||
| 526 | dec_bdi_stat(req->wb_page->mapping->backing_dev_info, | ||
| 527 | BDI_RECLAIMABLE); | ||
| 528 | nfs_list_remove_request(req); | 519 | nfs_list_remove_request(req); |
| 529 | clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); | 520 | nfs_clear_request_commit(req); |
| 530 | nfs_inode_remove_request(req); | 521 | nfs_inode_remove_request(req); |
| 531 | nfs_unlock_request(req); | 522 | nfs_unlock_request(req); |
| 532 | } | 523 | } |
| @@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg | |||
| 564 | #endif | 555 | #endif |
| 565 | 556 | ||
| 566 | /* | 557 | /* |
| 567 | * Try to update any existing write request, or create one if there is none. | 558 | * Search for an existing write request, and attempt to update |
| 568 | * In order to match, the request's credentials must match those of | 559 | * it to reflect a new dirty region on a given page. |
| 569 | * the calling process. | ||
| 570 | * | 560 | * |
| 571 | * Note: Should always be called with the Page Lock held! | 561 | * If the attempt fails, then the existing request is flushed out |
| 562 | * to disk. | ||
| 572 | */ | 563 | */ |
| 573 | static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, | 564 | static struct nfs_page *nfs_try_to_update_request(struct inode *inode, |
| 574 | struct page *page, unsigned int offset, unsigned int bytes) | 565 | struct page *page, |
| 566 | unsigned int offset, | ||
| 567 | unsigned int bytes) | ||
| 575 | { | 568 | { |
| 576 | struct address_space *mapping = page->mapping; | 569 | struct nfs_page *req; |
| 577 | struct inode *inode = mapping->host; | 570 | unsigned int rqend; |
| 578 | struct nfs_page *req, *new = NULL; | 571 | unsigned int end; |
| 579 | pgoff_t rqend, end; | 572 | int error; |
| 573 | |||
| 574 | if (!PagePrivate(page)) | ||
| 575 | return NULL; | ||
| 580 | 576 | ||
| 581 | end = offset + bytes; | 577 | end = offset + bytes; |
| 578 | spin_lock(&inode->i_lock); | ||
| 582 | 579 | ||
| 583 | for (;;) { | 580 | for (;;) { |
| 584 | /* Loop over all inode entries and see if we find | 581 | req = nfs_page_find_request_locked(page); |
| 585 | * A request for the page we wish to update | 582 | if (req == NULL) |
| 583 | goto out_unlock; | ||
| 584 | |||
| 585 | rqend = req->wb_offset + req->wb_bytes; | ||
| 586 | /* | ||
| 587 | * Tell the caller to flush out the request if | ||
| 588 | * the offsets are non-contiguous. | ||
| 589 | * Note: nfs_flush_incompatible() will already | ||
| 590 | * have flushed out requests having wrong owners. | ||
| 586 | */ | 591 | */ |
| 587 | if (new) { | 592 | if (offset > rqend |
| 588 | if (radix_tree_preload(GFP_NOFS)) { | 593 | || end < req->wb_offset) |
| 589 | nfs_release_request(new); | 594 | goto out_flushme; |
| 590 | return ERR_PTR(-ENOMEM); | ||
| 591 | } | ||
| 592 | } | ||
| 593 | 595 | ||
| 594 | spin_lock(&inode->i_lock); | 596 | if (nfs_set_page_tag_locked(req)) |
| 595 | req = nfs_page_find_request_locked(page); | ||
| 596 | if (req) { | ||
| 597 | if (!nfs_set_page_tag_locked(req)) { | ||
| 598 | int error; | ||
| 599 | |||
| 600 | spin_unlock(&inode->i_lock); | ||
| 601 | error = nfs_wait_on_request(req); | ||
| 602 | nfs_release_request(req); | ||
| 603 | if (error < 0) { | ||
| 604 | if (new) { | ||
| 605 | radix_tree_preload_end(); | ||
| 606 | nfs_release_request(new); | ||
| 607 | } | ||
| 608 | return ERR_PTR(error); | ||
| 609 | } | ||
| 610 | continue; | ||
| 611 | } | ||
| 612 | spin_unlock(&inode->i_lock); | ||
| 613 | if (new) { | ||
| 614 | radix_tree_preload_end(); | ||
| 615 | nfs_release_request(new); | ||
| 616 | } | ||
| 617 | break; | 597 | break; |
| 618 | } | ||
| 619 | 598 | ||
| 620 | if (new) { | 599 | /* The request is locked, so wait and then retry */ |
| 621 | nfs_lock_request_dontget(new); | ||
| 622 | nfs_inode_add_request(inode, new); | ||
| 623 | spin_unlock(&inode->i_lock); | ||
| 624 | radix_tree_preload_end(); | ||
| 625 | req = new; | ||
| 626 | goto zero_page; | ||
| 627 | } | ||
| 628 | spin_unlock(&inode->i_lock); | 600 | spin_unlock(&inode->i_lock); |
| 629 | 601 | error = nfs_wait_on_request(req); | |
| 630 | new = nfs_create_request(ctx, inode, page, offset, bytes); | 602 | nfs_release_request(req); |
| 631 | if (IS_ERR(new)) | 603 | if (error != 0) |
| 632 | return new; | 604 | goto out_err; |
| 605 | spin_lock(&inode->i_lock); | ||
| 633 | } | 606 | } |
| 634 | 607 | ||
| 635 | /* We have a request for our page. | 608 | if (nfs_clear_request_commit(req)) |
| 636 | * If the creds don't match, or the | 609 | radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, |
| 637 | * page addresses don't match, | 610 | req->wb_index, NFS_PAGE_TAG_COMMIT); |
| 638 | * tell the caller to wait on the conflicting | ||
| 639 | * request. | ||
| 640 | */ | ||
| 641 | rqend = req->wb_offset + req->wb_bytes; | ||
| 642 | if (req->wb_context != ctx | ||
| 643 | || req->wb_page != page | ||
| 644 | || !nfs_dirty_request(req) | ||
| 645 | || offset > rqend || end < req->wb_offset) { | ||
| 646 | nfs_clear_page_tag_locked(req); | ||
| 647 | return ERR_PTR(-EBUSY); | ||
| 648 | } | ||
| 649 | 611 | ||
| 650 | /* Okay, the request matches. Update the region */ | 612 | /* Okay, the request matches. Update the region */ |
| 651 | if (offset < req->wb_offset) { | 613 | if (offset < req->wb_offset) { |
| 652 | req->wb_offset = offset; | 614 | req->wb_offset = offset; |
| 653 | req->wb_pgbase = offset; | 615 | req->wb_pgbase = offset; |
| 654 | req->wb_bytes = max(end, rqend) - req->wb_offset; | ||
| 655 | goto zero_page; | ||
| 656 | } | 616 | } |
| 657 | |||
| 658 | if (end > rqend) | 617 | if (end > rqend) |
| 659 | req->wb_bytes = end - req->wb_offset; | 618 | req->wb_bytes = end - req->wb_offset; |
| 660 | 619 | else | |
| 620 | req->wb_bytes = rqend - req->wb_offset; | ||
| 621 | out_unlock: | ||
| 622 | spin_unlock(&inode->i_lock); | ||
| 661 | return req; | 623 | return req; |
| 662 | zero_page: | 624 | out_flushme: |
| 663 | /* If this page might potentially be marked as up to date, | 625 | spin_unlock(&inode->i_lock); |
| 664 | * then we need to zero any uninitalised data. */ | 626 | nfs_release_request(req); |
| 665 | if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE | 627 | error = nfs_wb_page(inode, page); |
| 666 | && !PageUptodate(req->wb_page)) | 628 | out_err: |
| 667 | zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); | 629 | return ERR_PTR(error); |
| 630 | } | ||
| 631 | |||
| 632 | /* | ||
| 633 | * Try to update an existing write request, or create one if there is none. | ||
| 634 | * | ||
| 635 | * Note: Should always be called with the Page Lock held to prevent races | ||
| 636 | * if we have to add a new request. Also assumes that the caller has | ||
| 637 | * already called nfs_flush_incompatible() if necessary. | ||
| 638 | */ | ||
| 639 | static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, | ||
| 640 | struct page *page, unsigned int offset, unsigned int bytes) | ||
| 641 | { | ||
| 642 | struct inode *inode = page->mapping->host; | ||
| 643 | struct nfs_page *req; | ||
| 644 | int error; | ||
| 645 | |||
| 646 | req = nfs_try_to_update_request(inode, page, offset, bytes); | ||
| 647 | if (req != NULL) | ||
| 648 | goto out; | ||
| 649 | req = nfs_create_request(ctx, inode, page, offset, bytes); | ||
| 650 | if (IS_ERR(req)) | ||
| 651 | goto out; | ||
| 652 | error = nfs_inode_add_request(inode, req); | ||
| 653 | if (error != 0) { | ||
| 654 | nfs_release_request(req); | ||
| 655 | req = ERR_PTR(error); | ||
| 656 | } | ||
| 657 | out: | ||
| 668 | return req; | 658 | return req; |
| 669 | } | 659 | } |
| 670 | 660 | ||
| 661 | static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, | ||
| 662 | unsigned int offset, unsigned int count) | ||
| 663 | { | ||
| 664 | struct nfs_page *req; | ||
| 665 | |||
| 666 | req = nfs_setup_write_request(ctx, page, offset, count); | ||
| 667 | if (IS_ERR(req)) | ||
| 668 | return PTR_ERR(req); | ||
| 669 | /* Update file length */ | ||
| 670 | nfs_grow_file(page, offset, count); | ||
| 671 | nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); | ||
| 672 | nfs_clear_page_tag_locked(req); | ||
| 673 | return 0; | ||
| 674 | } | ||
| 675 | |||
| 671 | int nfs_flush_incompatible(struct file *file, struct page *page) | 676 | int nfs_flush_incompatible(struct file *file, struct page *page) |
| 672 | { | 677 | { |
| 673 | struct nfs_open_context *ctx = nfs_file_open_context(file); | 678 | struct nfs_open_context *ctx = nfs_file_open_context(file); |
| @@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) | |||
| 685 | req = nfs_page_find_request(page); | 690 | req = nfs_page_find_request(page); |
| 686 | if (req == NULL) | 691 | if (req == NULL) |
| 687 | return 0; | 692 | return 0; |
| 688 | do_flush = req->wb_page != page || req->wb_context != ctx | 693 | do_flush = req->wb_page != page || req->wb_context != ctx; |
| 689 | || !nfs_dirty_request(req); | ||
| 690 | nfs_release_request(req); | 694 | nfs_release_request(req); |
| 691 | if (!do_flush) | 695 | if (!do_flush) |
| 692 | return 0; | 696 | return 0; |
| @@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
| 721 | 725 | ||
| 722 | nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); | 726 | nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); |
| 723 | 727 | ||
| 724 | dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", | 728 | dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", |
| 725 | file->f_path.dentry->d_parent->d_name.name, | 729 | file->f_path.dentry->d_parent->d_name.name, |
| 726 | file->f_path.dentry->d_name.name, count, | 730 | file->f_path.dentry->d_name.name, count, |
| 727 | (long long)(page_offset(page) +offset)); | 731 | (long long)(page_offset(page) + offset)); |
| 728 | 732 | ||
| 729 | /* If we're not using byte range locks, and we know the page | 733 | /* If we're not using byte range locks, and we know the page |
| 730 | * is up to date, it may be more efficient to extend the write | 734 | * is up to date, it may be more efficient to extend the write |
| @@ -739,24 +743,20 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
| 739 | } | 743 | } |
| 740 | 744 | ||
| 741 | status = nfs_writepage_setup(ctx, page, offset, count); | 745 | status = nfs_writepage_setup(ctx, page, offset, count); |
| 742 | __set_page_dirty_nobuffers(page); | ||
| 743 | |||
| 744 | dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", | ||
| 745 | status, (long long)i_size_read(inode)); | ||
| 746 | if (status < 0) | 746 | if (status < 0) |
| 747 | nfs_set_pageerror(page); | 747 | nfs_set_pageerror(page); |
| 748 | else | ||
| 749 | __set_page_dirty_nobuffers(page); | ||
| 750 | |||
| 751 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", | ||
| 752 | status, (long long)i_size_read(inode)); | ||
| 748 | return status; | 753 | return status; |
| 749 | } | 754 | } |
| 750 | 755 | ||
| 751 | static void nfs_writepage_release(struct nfs_page *req) | 756 | static void nfs_writepage_release(struct nfs_page *req) |
| 752 | { | 757 | { |
| 753 | 758 | ||
| 754 | if (PageError(req->wb_page)) { | 759 | if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { |
| 755 | nfs_end_page_writeback(req->wb_page); | ||
| 756 | nfs_inode_remove_request(req); | ||
| 757 | } else if (!nfs_reschedule_unstable_write(req)) { | ||
| 758 | /* Set the PG_uptodate flag */ | ||
| 759 | nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes); | ||
| 760 | nfs_end_page_writeback(req->wb_page); | 760 | nfs_end_page_writeback(req->wb_page); |
| 761 | nfs_inode_remove_request(req); | 761 | nfs_inode_remove_request(req); |
| 762 | } else | 762 | } else |
| @@ -833,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, | |||
| 833 | NFS_PROTO(inode)->write_setup(data, &msg); | 833 | NFS_PROTO(inode)->write_setup(data, &msg); |
| 834 | 834 | ||
| 835 | dprintk("NFS: %5u initiated write call " | 835 | dprintk("NFS: %5u initiated write call " |
| 836 | "(req %s/%Ld, %u bytes @ offset %Lu)\n", | 836 | "(req %s/%lld, %u bytes @ offset %llu)\n", |
| 837 | data->task.tk_pid, | 837 | data->task.tk_pid, |
| 838 | inode->i_sb->s_id, | 838 | inode->i_sb->s_id, |
| 839 | (long long)NFS_FILEID(inode), | 839 | (long long)NFS_FILEID(inode), |
| @@ -977,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, | |||
| 977 | static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) | 977 | static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) |
| 978 | { | 978 | { |
| 979 | struct nfs_write_data *data = calldata; | 979 | struct nfs_write_data *data = calldata; |
| 980 | struct nfs_page *req = data->req; | ||
| 981 | 980 | ||
| 982 | dprintk("NFS: write (%s/%Ld %d@%Ld)", | 981 | dprintk("NFS: %5u write(%s/%lld %d@%lld)", |
| 983 | req->wb_context->path.dentry->d_inode->i_sb->s_id, | 982 | task->tk_pid, |
| 984 | (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), | 983 | data->req->wb_context->path.dentry->d_inode->i_sb->s_id, |
| 985 | req->wb_bytes, | 984 | (long long) |
| 986 | (long long)req_offset(req)); | 985 | NFS_FILEID(data->req->wb_context->path.dentry->d_inode), |
| 986 | data->req->wb_bytes, (long long)req_offset(data->req)); | ||
| 987 | 987 | ||
| 988 | nfs_writeback_done(task, data); | 988 | nfs_writeback_done(task, data); |
| 989 | } | 989 | } |
| @@ -1057,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata) | |||
| 1057 | 1057 | ||
| 1058 | nfs_list_remove_request(req); | 1058 | nfs_list_remove_request(req); |
| 1059 | 1059 | ||
| 1060 | dprintk("NFS: write (%s/%Ld %d@%Ld)", | 1060 | dprintk("NFS: %5u write (%s/%lld %d@%lld)", |
| 1061 | data->task.tk_pid, | ||
| 1061 | req->wb_context->path.dentry->d_inode->i_sb->s_id, | 1062 | req->wb_context->path.dentry->d_inode->i_sb->s_id, |
| 1062 | (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), | 1063 | (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), |
| 1063 | req->wb_bytes, | 1064 | req->wb_bytes, |
| @@ -1077,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata) | |||
| 1077 | dprintk(" marked for commit\n"); | 1078 | dprintk(" marked for commit\n"); |
| 1078 | goto next; | 1079 | goto next; |
| 1079 | } | 1080 | } |
| 1080 | /* Set the PG_uptodate flag? */ | ||
| 1081 | nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); | ||
| 1082 | dprintk(" OK\n"); | 1081 | dprintk(" OK\n"); |
| 1083 | remove_request: | 1082 | remove_request: |
| 1084 | nfs_end_page_writeback(page); | 1083 | nfs_end_page_writeback(page); |
| @@ -1132,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) | |||
| 1132 | static unsigned long complain; | 1131 | static unsigned long complain; |
| 1133 | 1132 | ||
| 1134 | if (time_before(complain, jiffies)) { | 1133 | if (time_before(complain, jiffies)) { |
| 1135 | dprintk("NFS: faulty NFS server %s:" | 1134 | dprintk("NFS: faulty NFS server %s:" |
| 1136 | " (committed = %d) != (stable = %d)\n", | 1135 | " (committed = %d) != (stable = %d)\n", |
| 1137 | NFS_SERVER(data->inode)->nfs_client->cl_hostname, | 1136 | NFS_SERVER(data->inode)->nfs_client->cl_hostname, |
| 1138 | resp->verf->committed, argp->stable); | 1137 | resp->verf->committed, argp->stable); |
| @@ -1296,12 +1295,9 @@ static void nfs_commit_release(void *calldata) | |||
| 1296 | while (!list_empty(&data->pages)) { | 1295 | while (!list_empty(&data->pages)) { |
| 1297 | req = nfs_list_entry(data->pages.next); | 1296 | req = nfs_list_entry(data->pages.next); |
| 1298 | nfs_list_remove_request(req); | 1297 | nfs_list_remove_request(req); |
| 1299 | clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); | 1298 | nfs_clear_request_commit(req); |
| 1300 | dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); | ||
| 1301 | dec_bdi_stat(req->wb_page->mapping->backing_dev_info, | ||
| 1302 | BDI_RECLAIMABLE); | ||
| 1303 | 1299 | ||
| 1304 | dprintk("NFS: commit (%s/%Ld %d@%Ld)", | 1300 | dprintk("NFS: commit (%s/%lld %d@%lld)", |
| 1305 | req->wb_context->path.dentry->d_inode->i_sb->s_id, | 1301 | req->wb_context->path.dentry->d_inode->i_sb->s_id, |
| 1306 | (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), | 1302 | (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), |
| 1307 | req->wb_bytes, | 1303 | req->wb_bytes, |
| @@ -1317,9 +1313,6 @@ static void nfs_commit_release(void *calldata) | |||
| 1317 | * returned by the server against all stored verfs. */ | 1313 | * returned by the server against all stored verfs. */ |
| 1318 | if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { | 1314 | if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { |
| 1319 | /* We have a match */ | 1315 | /* We have a match */ |
| 1320 | /* Set the PG_uptodate flag */ | ||
| 1321 | nfs_mark_uptodate(req->wb_page, req->wb_pgbase, | ||
| 1322 | req->wb_bytes); | ||
| 1323 | nfs_inode_remove_request(req); | 1316 | nfs_inode_remove_request(req); |
| 1324 | dprintk(" OK\n"); | 1317 | dprintk(" OK\n"); |
| 1325 | goto next; | 1318 | goto next; |
| @@ -1478,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) | |||
| 1478 | req = nfs_page_find_request(page); | 1471 | req = nfs_page_find_request(page); |
| 1479 | if (req == NULL) | 1472 | if (req == NULL) |
| 1480 | goto out; | 1473 | goto out; |
| 1481 | if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { | 1474 | if (test_bit(PG_CLEAN, &req->wb_flags)) { |
| 1482 | nfs_release_request(req); | 1475 | nfs_release_request(req); |
| 1483 | break; | 1476 | break; |
| 1484 | } | 1477 | } |
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4d4760e687c3..702fa577aa6e 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c | |||
| @@ -381,7 +381,7 @@ static int do_probe_callback(void *data) | |||
| 381 | .program = &cb_program, | 381 | .program = &cb_program, |
| 382 | .version = nfs_cb_version[1]->number, | 382 | .version = nfs_cb_version[1]->number, |
| 383 | .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ | 383 | .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ |
| 384 | .flags = (RPC_CLNT_CREATE_NOPING), | 384 | .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), |
| 385 | }; | 385 | }; |
| 386 | struct rpc_message msg = { | 386 | struct rpc_message msg = { |
| 387 | .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], | 387 | .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], |
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cf9401e8cd0b..cfdb08b484ed 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | 21 | ||
| 22 | #include <linux/kernel.h> | 22 | #include <linux/kernel.h> |
| 23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
| 24 | #include <linux/sysctl.h> | ||
| 25 | #include <linux/configfs.h> | 24 | #include <linux/configfs.h> |
| 26 | 25 | ||
| 27 | #include "tcp.h" | 26 | #include "tcp.h" |
| @@ -36,65 +35,6 @@ | |||
| 36 | * cluster references throughout where nodes are looked up */ | 35 | * cluster references throughout where nodes are looked up */ |
| 37 | struct o2nm_cluster *o2nm_single_cluster = NULL; | 36 | struct o2nm_cluster *o2nm_single_cluster = NULL; |
| 38 | 37 | ||
| 39 | #define OCFS2_MAX_HB_CTL_PATH 256 | ||
| 40 | static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; | ||
| 41 | |||
| 42 | static ctl_table ocfs2_nm_table[] = { | ||
| 43 | { | ||
| 44 | .ctl_name = 1, | ||
| 45 | .procname = "hb_ctl_path", | ||
| 46 | .data = ocfs2_hb_ctl_path, | ||
| 47 | .maxlen = OCFS2_MAX_HB_CTL_PATH, | ||
| 48 | .mode = 0644, | ||
| 49 | .proc_handler = &proc_dostring, | ||
| 50 | .strategy = &sysctl_string, | ||
| 51 | }, | ||
| 52 | { .ctl_name = 0 } | ||
| 53 | }; | ||
| 54 | |||
| 55 | static ctl_table ocfs2_mod_table[] = { | ||
| 56 | { | ||
| 57 | .ctl_name = FS_OCFS2_NM, | ||
| 58 | .procname = "nm", | ||
| 59 | .data = NULL, | ||
| 60 | .maxlen = 0, | ||
| 61 | .mode = 0555, | ||
| 62 | .child = ocfs2_nm_table | ||
| 63 | }, | ||
| 64 | { .ctl_name = 0} | ||
| 65 | }; | ||
| 66 | |||
| 67 | static ctl_table ocfs2_kern_table[] = { | ||
| 68 | { | ||
| 69 | .ctl_name = FS_OCFS2, | ||
| 70 | .procname = "ocfs2", | ||
| 71 | .data = NULL, | ||
| 72 | .maxlen = 0, | ||
| 73 | .mode = 0555, | ||
| 74 | .child = ocfs2_mod_table | ||
| 75 | }, | ||
| 76 | { .ctl_name = 0} | ||
| 77 | }; | ||
| 78 | |||
| 79 | static ctl_table ocfs2_root_table[] = { | ||
| 80 | { | ||
| 81 | .ctl_name = CTL_FS, | ||
| 82 | .procname = "fs", | ||
| 83 | .data = NULL, | ||
| 84 | .maxlen = 0, | ||
| 85 | .mode = 0555, | ||
| 86 | .child = ocfs2_kern_table | ||
| 87 | }, | ||
| 88 | { .ctl_name = 0 } | ||
| 89 | }; | ||
| 90 | |||
| 91 | static struct ctl_table_header *ocfs2_table_header = NULL; | ||
| 92 | |||
| 93 | const char *o2nm_get_hb_ctl_path(void) | ||
| 94 | { | ||
| 95 | return ocfs2_hb_ctl_path; | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); | ||
| 98 | 38 | ||
| 99 | struct o2nm_node *o2nm_get_node_by_num(u8 node_num) | 39 | struct o2nm_node *o2nm_get_node_by_num(u8 node_num) |
| 100 | { | 40 | { |
| @@ -941,9 +881,6 @@ void o2nm_undepend_this_node(void) | |||
| 941 | 881 | ||
| 942 | static void __exit exit_o2nm(void) | 882 | static void __exit exit_o2nm(void) |
| 943 | { | 883 | { |
| 944 | if (ocfs2_table_header) | ||
| 945 | unregister_sysctl_table(ocfs2_table_header); | ||
| 946 | |||
| 947 | /* XXX sync with hb callbacks and shut down hb? */ | 884 | /* XXX sync with hb callbacks and shut down hb? */ |
| 948 | o2net_unregister_hb_callbacks(); | 885 | o2net_unregister_hb_callbacks(); |
| 949 | configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); | 886 | configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); |
| @@ -964,16 +901,9 @@ static int __init init_o2nm(void) | |||
| 964 | if (ret) | 901 | if (ret) |
| 965 | goto out; | 902 | goto out; |
| 966 | 903 | ||
| 967 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table); | ||
| 968 | if (!ocfs2_table_header) { | ||
| 969 | printk(KERN_ERR "nodemanager: unable to register sysctl\n"); | ||
| 970 | ret = -ENOMEM; /* or something. */ | ||
| 971 | goto out_o2net; | ||
| 972 | } | ||
| 973 | |||
| 974 | ret = o2net_register_hb_callbacks(); | 904 | ret = o2net_register_hb_callbacks(); |
| 975 | if (ret) | 905 | if (ret) |
| 976 | goto out_sysctl; | 906 | goto out_o2net; |
| 977 | 907 | ||
| 978 | config_group_init(&o2nm_cluster_group.cs_subsys.su_group); | 908 | config_group_init(&o2nm_cluster_group.cs_subsys.su_group); |
| 979 | mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); | 909 | mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); |
| @@ -990,8 +920,6 @@ static int __init init_o2nm(void) | |||
| 990 | configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); | 920 | configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); |
| 991 | out_callbacks: | 921 | out_callbacks: |
| 992 | o2net_unregister_hb_callbacks(); | 922 | o2net_unregister_hb_callbacks(); |
| 993 | out_sysctl: | ||
| 994 | unregister_sysctl_table(ocfs2_table_header); | ||
| 995 | out_o2net: | 923 | out_o2net: |
| 996 | o2net_exit(); | 924 | o2net_exit(); |
| 997 | out: | 925 | out: |
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 7c860361b8dd..c992ea0da4ad 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h | |||
| @@ -33,10 +33,6 @@ | |||
| 33 | #include <linux/configfs.h> | 33 | #include <linux/configfs.h> |
| 34 | #include <linux/rbtree.h> | 34 | #include <linux/rbtree.h> |
| 35 | 35 | ||
| 36 | #define FS_OCFS2_NM 1 | ||
| 37 | |||
| 38 | const char *o2nm_get_hb_ctl_path(void); | ||
| 39 | |||
| 40 | struct o2nm_node { | 36 | struct o2nm_node { |
| 41 | spinlock_t nd_lock; | 37 | spinlock_t nd_lock; |
| 42 | struct config_item nd_item; | 38 | struct config_item nd_item; |
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index efc015c6128a..44f87caf3683 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
| @@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
| 606 | 606 | ||
| 607 | res->last_used = 0; | 607 | res->last_used = 0; |
| 608 | 608 | ||
| 609 | spin_lock(&dlm->spinlock); | ||
| 609 | list_add_tail(&res->tracking, &dlm->tracking_list); | 610 | list_add_tail(&res->tracking, &dlm->tracking_list); |
| 611 | spin_unlock(&dlm->spinlock); | ||
| 610 | 612 | ||
| 611 | memset(res->lvb, 0, DLM_LVB_LEN); | 613 | memset(res->lvb, 0, DLM_LVB_LEN); |
| 612 | memset(res->refmap, 0, sizeof(res->refmap)); | 614 | memset(res->refmap, 0, sizeof(res->refmap)); |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 394d25a131a5..80e20d9f2780 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
| @@ -1554,8 +1554,8 @@ out: | |||
| 1554 | */ | 1554 | */ |
| 1555 | int ocfs2_file_lock(struct file *file, int ex, int trylock) | 1555 | int ocfs2_file_lock(struct file *file, int ex, int trylock) |
| 1556 | { | 1556 | { |
| 1557 | int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; | 1557 | int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 1558 | unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; | 1558 | unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0; |
| 1559 | unsigned long flags; | 1559 | unsigned long flags; |
| 1560 | struct ocfs2_file_private *fp = file->private_data; | 1560 | struct ocfs2_file_private *fp = file->private_data; |
| 1561 | struct ocfs2_lock_res *lockres = &fp->fp_flock; | 1561 | struct ocfs2_lock_res *lockres = &fp->fp_flock; |
| @@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
| 1582 | * Get the lock at NLMODE to start - that way we | 1582 | * Get the lock at NLMODE to start - that way we |
| 1583 | * can cancel the upconvert request if need be. | 1583 | * can cancel the upconvert request if need be. |
| 1584 | */ | 1584 | */ |
| 1585 | ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); | 1585 | ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); |
| 1586 | if (ret < 0) { | 1586 | if (ret < 0) { |
| 1587 | mlog_errno(ret); | 1587 | mlog_errno(ret); |
| 1588 | goto out; | 1588 | goto out; |
| @@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
| 1597 | } | 1597 | } |
| 1598 | 1598 | ||
| 1599 | lockres->l_action = OCFS2_AST_CONVERT; | 1599 | lockres->l_action = OCFS2_AST_CONVERT; |
| 1600 | lkm_flags |= LKM_CONVERT; | 1600 | lkm_flags |= DLM_LKF_CONVERT; |
| 1601 | lockres->l_requested = level; | 1601 | lockres->l_requested = level; |
| 1602 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 1602 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
| 1603 | 1603 | ||
| @@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file) | |||
| 1664 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) | 1664 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) |
| 1665 | return; | 1665 | return; |
| 1666 | 1666 | ||
| 1667 | if (lockres->l_level == LKM_NLMODE) | 1667 | if (lockres->l_level == DLM_LOCK_NL) |
| 1668 | return; | 1668 | return; |
| 1669 | 1669 | ||
| 1670 | mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", | 1670 | mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", |
| @@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file) | |||
| 1678 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); | 1678 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); |
| 1679 | lockres->l_blocking = DLM_LOCK_EX; | 1679 | lockres->l_blocking = DLM_LOCK_EX; |
| 1680 | 1680 | ||
| 1681 | gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); | 1681 | gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); |
| 1682 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | 1682 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); |
| 1683 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1683 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 1684 | 1684 | ||
| 1685 | ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); | 1685 | ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); |
| 1686 | if (ret) { | 1686 | if (ret) { |
| 1687 | mlog_errno(ret); | 1687 | mlog_errno(ret); |
| 1688 | return; | 1688 | return; |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bbd1667aa7d3..fcd120f1493a 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
| @@ -317,8 +317,7 @@ out: | |||
| 317 | return rc; | 317 | return rc; |
| 318 | } | 318 | } |
| 319 | 319 | ||
| 320 | static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, | 320 | static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) |
| 321 | int hangup_pending) | ||
| 322 | { | 321 | { |
| 323 | struct dlm_ctxt *dlm = conn->cc_lockspace; | 322 | struct dlm_ctxt *dlm = conn->cc_lockspace; |
| 324 | struct o2dlm_private *priv = conn->cc_private; | 323 | struct o2dlm_private *priv = conn->cc_private; |
| @@ -333,43 +332,6 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, | |||
| 333 | return 0; | 332 | return 0; |
| 334 | } | 333 | } |
| 335 | 334 | ||
| 336 | static void o2hb_stop(const char *group) | ||
| 337 | { | ||
| 338 | int ret; | ||
| 339 | char *argv[5], *envp[3]; | ||
| 340 | |||
| 341 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
| 342 | argv[1] = "-K"; | ||
| 343 | argv[2] = "-u"; | ||
| 344 | argv[3] = (char *)group; | ||
| 345 | argv[4] = NULL; | ||
| 346 | |||
| 347 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
| 348 | |||
| 349 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
| 350 | envp[0] = "HOME=/"; | ||
| 351 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
| 352 | envp[2] = NULL; | ||
| 353 | |||
| 354 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | ||
| 355 | if (ret < 0) | ||
| 356 | mlog_errno(ret); | ||
| 357 | } | ||
| 358 | |||
| 359 | /* | ||
| 360 | * Hangup is a hack for tools compatibility. Older ocfs2-tools software | ||
| 361 | * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This | ||
| 362 | * happens regardless of whether the DLM got started, so we can't do it | ||
| 363 | * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into | ||
| 364 | * the glue and provide a "hangup" API for super.c to call. | ||
| 365 | * | ||
| 366 | * Other stacks will eventually provide a NULL ->hangup() pointer. | ||
| 367 | */ | ||
| 368 | static void o2cb_cluster_hangup(const char *group, int grouplen) | ||
| 369 | { | ||
| 370 | o2hb_stop(group); | ||
| 371 | } | ||
| 372 | |||
| 373 | static int o2cb_cluster_this_node(unsigned int *node) | 335 | static int o2cb_cluster_this_node(unsigned int *node) |
| 374 | { | 336 | { |
| 375 | int node_num; | 337 | int node_num; |
| @@ -388,7 +350,6 @@ static int o2cb_cluster_this_node(unsigned int *node) | |||
| 388 | static struct ocfs2_stack_operations o2cb_stack_ops = { | 350 | static struct ocfs2_stack_operations o2cb_stack_ops = { |
| 389 | .connect = o2cb_cluster_connect, | 351 | .connect = o2cb_cluster_connect, |
| 390 | .disconnect = o2cb_cluster_disconnect, | 352 | .disconnect = o2cb_cluster_disconnect, |
| 391 | .hangup = o2cb_cluster_hangup, | ||
| 392 | .this_node = o2cb_cluster_this_node, | 353 | .this_node = o2cb_cluster_this_node, |
| 393 | .dlm_lock = o2cb_dlm_lock, | 354 | .dlm_lock = o2cb_dlm_lock, |
| 394 | .dlm_unlock = o2cb_dlm_unlock, | 355 | .dlm_unlock = o2cb_dlm_unlock, |
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 6b97d11f6bf8..bd7e0f3acfc7 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
| 22 | #include <linux/miscdevice.h> | 22 | #include <linux/miscdevice.h> |
| 23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
| 24 | #include <linux/smp_lock.h> | ||
| 24 | #include <linux/reboot.h> | 25 | #include <linux/reboot.h> |
| 25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
| 26 | 27 | ||
| @@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file) | |||
| 619 | return -ENOMEM; | 620 | return -ENOMEM; |
| 620 | p->op_this_node = -1; | 621 | p->op_this_node = -1; |
| 621 | 622 | ||
| 623 | lock_kernel(); | ||
| 622 | mutex_lock(&ocfs2_control_lock); | 624 | mutex_lock(&ocfs2_control_lock); |
| 623 | file->private_data = p; | 625 | file->private_data = p; |
| 624 | list_add(&p->op_list, &ocfs2_control_private_list); | 626 | list_add(&p->op_list, &ocfs2_control_private_list); |
| 625 | mutex_unlock(&ocfs2_control_lock); | 627 | mutex_unlock(&ocfs2_control_lock); |
| 628 | unlock_kernel(); | ||
| 626 | 629 | ||
| 627 | return 0; | 630 | return 0; |
| 628 | } | 631 | } |
| @@ -816,8 +819,7 @@ out: | |||
| 816 | return rc; | 819 | return rc; |
| 817 | } | 820 | } |
| 818 | 821 | ||
| 819 | static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn, | 822 | static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) |
| 820 | int hangup_pending) | ||
| 821 | { | 823 | { |
| 822 | dlm_release_lockspace(conn->cc_lockspace, 2); | 824 | dlm_release_lockspace(conn->cc_lockspace, 2); |
| 823 | conn->cc_lockspace = NULL; | 825 | conn->cc_lockspace = NULL; |
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 119f60cea9cc..10e149ae5e3a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
| 27 | #include <linux/kobject.h> | 27 | #include <linux/kobject.h> |
| 28 | #include <linux/sysfs.h> | 28 | #include <linux/sysfs.h> |
| 29 | #include <linux/sysctl.h> | ||
| 29 | 30 | ||
| 30 | #include "ocfs2_fs.h" | 31 | #include "ocfs2_fs.h" |
| 31 | 32 | ||
| @@ -33,11 +34,13 @@ | |||
| 33 | 34 | ||
| 34 | #define OCFS2_STACK_PLUGIN_O2CB "o2cb" | 35 | #define OCFS2_STACK_PLUGIN_O2CB "o2cb" |
| 35 | #define OCFS2_STACK_PLUGIN_USER "user" | 36 | #define OCFS2_STACK_PLUGIN_USER "user" |
| 37 | #define OCFS2_MAX_HB_CTL_PATH 256 | ||
| 36 | 38 | ||
| 37 | static struct ocfs2_locking_protocol *lproto; | 39 | static struct ocfs2_locking_protocol *lproto; |
| 38 | static DEFINE_SPINLOCK(ocfs2_stack_lock); | 40 | static DEFINE_SPINLOCK(ocfs2_stack_lock); |
| 39 | static LIST_HEAD(ocfs2_stack_list); | 41 | static LIST_HEAD(ocfs2_stack_list); |
| 40 | static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; | 42 | static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; |
| 43 | static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; | ||
| 41 | 44 | ||
| 42 | /* | 45 | /* |
| 43 | * The stack currently in use. If not null, active_stack->sp_count > 0, | 46 | * The stack currently in use. If not null, active_stack->sp_count > 0, |
| @@ -349,7 +352,7 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | |||
| 349 | 352 | ||
| 350 | BUG_ON(conn == NULL); | 353 | BUG_ON(conn == NULL); |
| 351 | 354 | ||
| 352 | ret = active_stack->sp_ops->disconnect(conn, hangup_pending); | 355 | ret = active_stack->sp_ops->disconnect(conn); |
| 353 | 356 | ||
| 354 | /* XXX Should we free it anyway? */ | 357 | /* XXX Should we free it anyway? */ |
| 355 | if (!ret) { | 358 | if (!ret) { |
| @@ -362,13 +365,48 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | |||
| 362 | } | 365 | } |
| 363 | EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); | 366 | EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); |
| 364 | 367 | ||
| 368 | /* | ||
| 369 | * Leave the group for this filesystem. This is executed by a userspace | ||
| 370 | * program (stored in ocfs2_hb_ctl_path). | ||
| 371 | */ | ||
| 372 | static void ocfs2_leave_group(const char *group) | ||
| 373 | { | ||
| 374 | int ret; | ||
| 375 | char *argv[5], *envp[3]; | ||
| 376 | |||
| 377 | argv[0] = ocfs2_hb_ctl_path; | ||
| 378 | argv[1] = "-K"; | ||
| 379 | argv[2] = "-u"; | ||
| 380 | argv[3] = (char *)group; | ||
| 381 | argv[4] = NULL; | ||
| 382 | |||
| 383 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
| 384 | envp[0] = "HOME=/"; | ||
| 385 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
| 386 | envp[2] = NULL; | ||
| 387 | |||
| 388 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | ||
| 389 | if (ret < 0) { | ||
| 390 | printk(KERN_ERR | ||
| 391 | "ocfs2: Error %d running user helper " | ||
| 392 | "\"%s %s %s %s\"\n", | ||
| 393 | ret, argv[0], argv[1], argv[2], argv[3]); | ||
| 394 | } | ||
| 395 | } | ||
| 396 | |||
| 397 | /* | ||
| 398 | * Hangup is a required post-umount. ocfs2-tools software expects the | ||
| 399 | * filesystem to call "ocfs2_hb_ctl" during unmount. This happens | ||
| 400 | * regardless of whether the DLM got started, so we can't do it | ||
| 401 | * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does | ||
| 402 | * the actual work. | ||
| 403 | */ | ||
| 365 | void ocfs2_cluster_hangup(const char *group, int grouplen) | 404 | void ocfs2_cluster_hangup(const char *group, int grouplen) |
| 366 | { | 405 | { |
| 367 | BUG_ON(group == NULL); | 406 | BUG_ON(group == NULL); |
| 368 | BUG_ON(group[grouplen] != '\0'); | 407 | BUG_ON(group[grouplen] != '\0'); |
| 369 | 408 | ||
| 370 | if (active_stack->sp_ops->hangup) | 409 | ocfs2_leave_group(group); |
| 371 | active_stack->sp_ops->hangup(group, grouplen); | ||
| 372 | 410 | ||
| 373 | /* cluster_disconnect() was called with hangup_pending==1 */ | 411 | /* cluster_disconnect() was called with hangup_pending==1 */ |
| 374 | ocfs2_stack_driver_put(); | 412 | ocfs2_stack_driver_put(); |
| @@ -548,10 +586,83 @@ error: | |||
| 548 | return ret; | 586 | return ret; |
| 549 | } | 587 | } |
| 550 | 588 | ||
| 589 | /* | ||
| 590 | * Sysctl bits | ||
| 591 | * | ||
| 592 | * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't | ||
| 593 | * make as much sense in a multiple cluster stack world, but it's safer | ||
| 594 | * and easier to preserve the name. | ||
| 595 | */ | ||
| 596 | |||
| 597 | #define FS_OCFS2_NM 1 | ||
| 598 | |||
| 599 | static ctl_table ocfs2_nm_table[] = { | ||
| 600 | { | ||
| 601 | .ctl_name = 1, | ||
| 602 | .procname = "hb_ctl_path", | ||
| 603 | .data = ocfs2_hb_ctl_path, | ||
| 604 | .maxlen = OCFS2_MAX_HB_CTL_PATH, | ||
| 605 | .mode = 0644, | ||
| 606 | .proc_handler = &proc_dostring, | ||
| 607 | .strategy = &sysctl_string, | ||
| 608 | }, | ||
| 609 | { .ctl_name = 0 } | ||
| 610 | }; | ||
| 611 | |||
| 612 | static ctl_table ocfs2_mod_table[] = { | ||
| 613 | { | ||
| 614 | .ctl_name = FS_OCFS2_NM, | ||
| 615 | .procname = "nm", | ||
| 616 | .data = NULL, | ||
| 617 | .maxlen = 0, | ||
| 618 | .mode = 0555, | ||
| 619 | .child = ocfs2_nm_table | ||
| 620 | }, | ||
| 621 | { .ctl_name = 0} | ||
| 622 | }; | ||
| 623 | |||
| 624 | static ctl_table ocfs2_kern_table[] = { | ||
| 625 | { | ||
| 626 | .ctl_name = FS_OCFS2, | ||
| 627 | .procname = "ocfs2", | ||
| 628 | .data = NULL, | ||
| 629 | .maxlen = 0, | ||
| 630 | .mode = 0555, | ||
| 631 | .child = ocfs2_mod_table | ||
| 632 | }, | ||
| 633 | { .ctl_name = 0} | ||
| 634 | }; | ||
| 635 | |||
| 636 | static ctl_table ocfs2_root_table[] = { | ||
| 637 | { | ||
| 638 | .ctl_name = CTL_FS, | ||
| 639 | .procname = "fs", | ||
| 640 | .data = NULL, | ||
| 641 | .maxlen = 0, | ||
| 642 | .mode = 0555, | ||
| 643 | .child = ocfs2_kern_table | ||
| 644 | }, | ||
| 645 | { .ctl_name = 0 } | ||
| 646 | }; | ||
| 647 | |||
| 648 | static struct ctl_table_header *ocfs2_table_header = NULL; | ||
| 649 | |||
| 650 | |||
| 651 | /* | ||
| 652 | * Initialization | ||
| 653 | */ | ||
| 654 | |||
| 551 | static int __init ocfs2_stack_glue_init(void) | 655 | static int __init ocfs2_stack_glue_init(void) |
| 552 | { | 656 | { |
| 553 | strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); | 657 | strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); |
| 554 | 658 | ||
| 659 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table); | ||
| 660 | if (!ocfs2_table_header) { | ||
| 661 | printk(KERN_ERR | ||
| 662 | "ocfs2 stack glue: unable to register sysctl\n"); | ||
| 663 | return -ENOMEM; /* or something. */ | ||
| 664 | } | ||
| 665 | |||
| 555 | return ocfs2_sysfs_init(); | 666 | return ocfs2_sysfs_init(); |
| 556 | } | 667 | } |
| 557 | 668 | ||
| @@ -559,6 +670,8 @@ static void __exit ocfs2_stack_glue_exit(void) | |||
| 559 | { | 670 | { |
| 560 | lproto = NULL; | 671 | lproto = NULL; |
| 561 | ocfs2_sysfs_exit(); | 672 | ocfs2_sysfs_exit(); |
| 673 | if (ocfs2_table_header) | ||
| 674 | unregister_sysctl_table(ocfs2_table_header); | ||
| 562 | } | 675 | } |
| 563 | 676 | ||
| 564 | MODULE_AUTHOR("Oracle"); | 677 | MODULE_AUTHOR("Oracle"); |
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 005e4f170e0f..db56281dd1be 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h | |||
| @@ -134,22 +134,10 @@ struct ocfs2_stack_operations { | |||
| 134 | * be freed. Thus, a stack must not return from ->disconnect() | 134 | * be freed. Thus, a stack must not return from ->disconnect() |
| 135 | * until it will no longer reference the conn pointer. | 135 | * until it will no longer reference the conn pointer. |
| 136 | * | 136 | * |
| 137 | * If hangup_pending is zero, ocfs2_cluster_disconnect() will also | 137 | * Once this call returns, the stack glue will be dropping this |
| 138 | * be dropping the reference on the module. | 138 | * connection's reference on the module. |
| 139 | */ | 139 | */ |
| 140 | int (*disconnect)(struct ocfs2_cluster_connection *conn, | 140 | int (*disconnect)(struct ocfs2_cluster_connection *conn); |
| 141 | int hangup_pending); | ||
| 142 | |||
| 143 | /* | ||
| 144 | * ocfs2_cluster_hangup() exists for compatibility with older | ||
| 145 | * ocfs2 tools. Only the classic stack really needs it. As such | ||
| 146 | * ->hangup() is not required of all stacks. See the comment by | ||
| 147 | * ocfs2_cluster_hangup() for more details. | ||
| 148 | * | ||
| 149 | * Note that ocfs2_cluster_hangup() can only be called if | ||
| 150 | * hangup_pending was passed to ocfs2_cluster_disconnect(). | ||
| 151 | */ | ||
| 152 | void (*hangup)(const char *group, int grouplen); | ||
| 153 | 141 | ||
| 154 | /* | 142 | /* |
| 155 | * ->this_node() returns the cluster's unique identifier for the | 143 | * ->this_node() returns the cluster's unique identifier for the |
| @@ -258,4 +246,5 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) | |||
| 258 | /* Used by stack plugins */ | 246 | /* Used by stack plugins */ |
| 259 | int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); | 247 | int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); |
| 260 | void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); | 248 | void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); |
| 249 | |||
| 261 | #endif /* STACKGLUE_H */ | 250 | #endif /* STACKGLUE_H */ |
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
| 17 | #include <linux/backing-dev.h> | 17 | #include <linux/backing-dev.h> |
| 18 | #include <linux/capability.h> | 18 | #include <linux/capability.h> |
| 19 | #include <linux/securebits.h> | ||
| 19 | #include <linux/security.h> | 20 | #include <linux/security.h> |
| 20 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
| 21 | #include <linux/vfs.h> | 22 | #include <linux/vfs.h> |
| @@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) | |||
| 425 | { | 426 | { |
| 426 | struct nameidata nd; | 427 | struct nameidata nd; |
| 427 | int old_fsuid, old_fsgid; | 428 | int old_fsuid, old_fsgid; |
| 428 | kernel_cap_t old_cap; | 429 | kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ |
| 429 | int res; | 430 | int res; |
| 430 | 431 | ||
| 431 | if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ | 432 | if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ |
| @@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) | |||
| 433 | 434 | ||
| 434 | old_fsuid = current->fsuid; | 435 | old_fsuid = current->fsuid; |
| 435 | old_fsgid = current->fsgid; | 436 | old_fsgid = current->fsgid; |
| 436 | old_cap = current->cap_effective; | ||
| 437 | 437 | ||
| 438 | current->fsuid = current->uid; | 438 | current->fsuid = current->uid; |
| 439 | current->fsgid = current->gid; | 439 | current->fsgid = current->gid; |
| 440 | 440 | ||
| 441 | /* | 441 | if (!issecure(SECURE_NO_SETUID_FIXUP)) { |
| 442 | * Clear the capabilities if we switch to a non-root user | 442 | /* |
| 443 | * | 443 | * Clear the capabilities if we switch to a non-root user |
| 444 | * FIXME: There is a race here against sys_capset. The | 444 | */ |
| 445 | * capabilities can change yet we will restore the old | 445 | #ifndef CONFIG_SECURITY_FILE_CAPABILITIES |
| 446 | * value below. We should hold task_capabilities_lock, | 446 | /* |
| 447 | * but we cannot because user_path_walk can sleep. | 447 | * FIXME: There is a race here against sys_capset. The |
| 448 | */ | 448 | * capabilities can change yet we will restore the old |
| 449 | if (current->uid) | 449 | * value below. We should hold task_capabilities_lock, |
| 450 | cap_clear(current->cap_effective); | 450 | * but we cannot because user_path_walk can sleep. |
| 451 | else | 451 | */ |
| 452 | current->cap_effective = current->cap_permitted; | 452 | #endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ |
| 453 | if (current->uid) | ||
| 454 | old_cap = cap_set_effective(__cap_empty_set); | ||
| 455 | else | ||
| 456 | old_cap = cap_set_effective(current->cap_permitted); | ||
| 457 | } | ||
| 453 | 458 | ||
| 454 | res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); | 459 | res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); |
| 455 | if (res) | 460 | if (res) |
| @@ -478,7 +483,9 @@ out_path_release: | |||
| 478 | out: | 483 | out: |
| 479 | current->fsuid = old_fsuid; | 484 | current->fsuid = old_fsuid; |
| 480 | current->fsgid = old_fsgid; | 485 | current->fsgid = old_fsgid; |
| 481 | current->cap_effective = old_cap; | 486 | |
| 487 | if (!issecure(SECURE_NO_SETUID_FIXUP)) | ||
| 488 | cap_set_effective(old_cap); | ||
| 482 | 489 | ||
| 483 | return res; | 490 | return res; |
| 484 | } | 491 | } |
| @@ -1003,8 +1003,7 @@ struct file *create_write_pipe(void) | |||
| 1003 | void free_write_pipe(struct file *f) | 1003 | void free_write_pipe(struct file *f) |
| 1004 | { | 1004 | { |
| 1005 | free_pipe_info(f->f_dentry->d_inode); | 1005 | free_pipe_info(f->f_dentry->d_inode); |
| 1006 | dput(f->f_path.dentry); | 1006 | path_put(&f->f_path); |
| 1007 | mntput(f->f_path.mnt); | ||
| 1008 | put_filp(f); | 1007 | put_filp(f); |
| 1009 | } | 1008 | } |
| 1010 | 1009 | ||
| @@ -1015,8 +1014,8 @@ struct file *create_read_pipe(struct file *wrf) | |||
| 1015 | return ERR_PTR(-ENFILE); | 1014 | return ERR_PTR(-ENFILE); |
| 1016 | 1015 | ||
| 1017 | /* Grab pipe from the writer */ | 1016 | /* Grab pipe from the writer */ |
| 1018 | f->f_path.mnt = mntget(wrf->f_path.mnt); | 1017 | f->f_path = wrf->f_path; |
| 1019 | f->f_path.dentry = dget(wrf->f_path.dentry); | 1018 | path_get(&wrf->f_path); |
| 1020 | f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; | 1019 | f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; |
| 1021 | 1020 | ||
| 1022 | f->f_pos = 0; | 1021 | f->f_pos = 0; |
| @@ -1068,8 +1067,7 @@ int do_pipe(int *fd) | |||
| 1068 | err_fdr: | 1067 | err_fdr: |
| 1069 | put_unused_fd(fdr); | 1068 | put_unused_fd(fdr); |
| 1070 | err_read_pipe: | 1069 | err_read_pipe: |
| 1071 | dput(fr->f_dentry); | 1070 | path_put(&fr->f_path); |
| 1072 | mntput(fr->f_vfsmnt); | ||
| 1073 | put_filp(fr); | 1071 | put_filp(fr); |
| 1074 | err_write_pipe: | 1072 | err_write_pipe: |
| 1075 | free_write_pipe(fw); | 1073 | free_write_pipe(fw); |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b455371e7ff..58c3e6a8e15e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task) | |||
| 233 | */ | 233 | */ |
| 234 | if (task->parent == current && (task->ptrace & PT_PTRACED) && | 234 | if (task->parent == current && (task->ptrace & PT_PTRACED) && |
| 235 | task_is_stopped_or_traced(task) && | 235 | task_is_stopped_or_traced(task) && |
| 236 | ptrace_may_attach(task)) | 236 | ptrace_may_access(task, PTRACE_MODE_ATTACH)) |
| 237 | return 0; | 237 | return 0; |
| 238 | 238 | ||
| 239 | /* | 239 | /* |
| @@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task) | |||
| 251 | task_lock(task); | 251 | task_lock(task); |
| 252 | if (task->mm != mm) | 252 | if (task->mm != mm) |
| 253 | goto out; | 253 | goto out; |
| 254 | if (task->mm != current->mm && __ptrace_may_attach(task) < 0) | 254 | if (task->mm != current->mm && |
| 255 | __ptrace_may_access(task, PTRACE_MODE_READ) < 0) | ||
| 255 | goto out; | 256 | goto out; |
| 256 | task_unlock(task); | 257 | task_unlock(task); |
| 257 | return mm; | 258 | return mm; |
| @@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode) | |||
| 518 | */ | 519 | */ |
| 519 | task = get_proc_task(inode); | 520 | task = get_proc_task(inode); |
| 520 | if (task) { | 521 | if (task) { |
| 521 | allowed = ptrace_may_attach(task); | 522 | allowed = ptrace_may_access(task, PTRACE_MODE_READ); |
| 522 | put_task_struct(task); | 523 | put_task_struct(task); |
| 523 | } | 524 | } |
| 524 | return allowed; | 525 | return allowed; |
| @@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, | |||
| 904 | if (!task) | 905 | if (!task) |
| 905 | goto out_no_task; | 906 | goto out_no_task; |
| 906 | 907 | ||
| 907 | if (!ptrace_may_attach(task)) | 908 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
| 908 | goto out; | 909 | goto out; |
| 909 | 910 | ||
| 910 | ret = -ENOMEM; | 911 | ret = -ENOMEM; |
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 7e277f2ad466..c652d469dc08 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c | |||
| @@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off, | |||
| 123 | return proc_calc_metrics(page, start, off, count, eof, len); | 123 | return proc_calc_metrics(page, start, off, count, eof, len); |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | int __attribute__((weak)) arch_report_meminfo(char *page) | ||
| 127 | { | ||
| 128 | return 0; | ||
| 129 | } | ||
| 130 | |||
| 126 | static int meminfo_read_proc(char *page, char **start, off_t off, | 131 | static int meminfo_read_proc(char *page, char **start, off_t off, |
| 127 | int count, int *eof, void *data) | 132 | int count, int *eof, void *data) |
| 128 | { | 133 | { |
| @@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, | |||
| 221 | 226 | ||
| 222 | len += hugetlb_report_meminfo(page + len); | 227 | len += hugetlb_report_meminfo(page + len); |
| 223 | 228 | ||
| 229 | len += arch_report_meminfo(page + len); | ||
| 230 | |||
| 224 | return proc_calc_metrics(page, start, off, count, eof, len); | 231 | return proc_calc_metrics(page, start, off, count, eof, len); |
| 225 | #undef K | 232 | #undef K |
| 226 | } | 233 | } |
| @@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = { | |||
| 472 | }; | 479 | }; |
| 473 | #endif | 480 | #endif |
| 474 | 481 | ||
| 482 | #ifndef arch_irq_stat_cpu | ||
| 483 | #define arch_irq_stat_cpu(cpu) 0 | ||
| 484 | #endif | ||
| 485 | #ifndef arch_irq_stat | ||
| 486 | #define arch_irq_stat() 0 | ||
| 487 | #endif | ||
| 488 | |||
| 475 | static int show_stat(struct seq_file *p, void *v) | 489 | static int show_stat(struct seq_file *p, void *v) |
| 476 | { | 490 | { |
| 477 | int i; | 491 | int i; |
| @@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 509 | sum += temp; | 523 | sum += temp; |
| 510 | per_irq_sum[j] += temp; | 524 | per_irq_sum[j] += temp; |
| 511 | } | 525 | } |
| 526 | sum += arch_irq_stat_cpu(i); | ||
| 512 | } | 527 | } |
| 528 | sum += arch_irq_stat(); | ||
| 513 | 529 | ||
| 514 | seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", | 530 | seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", |
| 515 | (unsigned long long)cputime64_to_clock_t(user), | 531 | (unsigned long long)cputime64_to_clock_t(user), |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 17403629e330..164bd9f9ede3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
| @@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v) | |||
| 210 | dev_t dev = 0; | 210 | dev_t dev = 0; |
| 211 | int len; | 211 | int len; |
| 212 | 212 | ||
| 213 | if (maps_protect && !ptrace_may_attach(task)) | 213 | if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) |
| 214 | return -EACCES; | 214 | return -EACCES; |
| 215 | 215 | ||
| 216 | if (file) { | 216 | if (file) { |
| @@ -315,9 +315,9 @@ struct mem_size_stats { | |||
| 315 | }; | 315 | }; |
| 316 | 316 | ||
| 317 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 317 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
| 318 | void *private) | 318 | struct mm_walk *walk) |
| 319 | { | 319 | { |
| 320 | struct mem_size_stats *mss = private; | 320 | struct mem_size_stats *mss = walk->private; |
| 321 | struct vm_area_struct *vma = mss->vma; | 321 | struct vm_area_struct *vma = mss->vma; |
| 322 | pte_t *pte, ptent; | 322 | pte_t *pte, ptent; |
| 323 | spinlock_t *ptl; | 323 | spinlock_t *ptl; |
| @@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
| 365 | return 0; | 365 | return 0; |
| 366 | } | 366 | } |
| 367 | 367 | ||
| 368 | static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; | ||
| 369 | |||
| 370 | static int show_smap(struct seq_file *m, void *v) | 368 | static int show_smap(struct seq_file *m, void *v) |
| 371 | { | 369 | { |
| 372 | struct vm_area_struct *vma = v; | 370 | struct vm_area_struct *vma = v; |
| 373 | struct mem_size_stats mss; | 371 | struct mem_size_stats mss; |
| 374 | int ret; | 372 | int ret; |
| 373 | struct mm_walk smaps_walk = { | ||
| 374 | .pmd_entry = smaps_pte_range, | ||
| 375 | .mm = vma->vm_mm, | ||
| 376 | .private = &mss, | ||
| 377 | }; | ||
| 375 | 378 | ||
| 376 | memset(&mss, 0, sizeof mss); | 379 | memset(&mss, 0, sizeof mss); |
| 377 | mss.vma = vma; | 380 | mss.vma = vma; |
| 378 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) | 381 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) |
| 379 | walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, | 382 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); |
| 380 | &smaps_walk, &mss); | ||
| 381 | 383 | ||
| 382 | ret = show_map(m, v); | 384 | ret = show_map(m, v); |
| 383 | if (ret) | 385 | if (ret) |
| @@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = { | |||
| 426 | }; | 428 | }; |
| 427 | 429 | ||
| 428 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | 430 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, |
| 429 | unsigned long end, void *private) | 431 | unsigned long end, struct mm_walk *walk) |
| 430 | { | 432 | { |
| 431 | struct vm_area_struct *vma = private; | 433 | struct vm_area_struct *vma = walk->private; |
| 432 | pte_t *pte, ptent; | 434 | pte_t *pte, ptent; |
| 433 | spinlock_t *ptl; | 435 | spinlock_t *ptl; |
| 434 | struct page *page; | 436 | struct page *page; |
| @@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
| 452 | return 0; | 454 | return 0; |
| 453 | } | 455 | } |
| 454 | 456 | ||
| 455 | static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; | ||
| 456 | |||
| 457 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, | 457 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, |
| 458 | size_t count, loff_t *ppos) | 458 | size_t count, loff_t *ppos) |
| 459 | { | 459 | { |
| @@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
| 476 | return -ESRCH; | 476 | return -ESRCH; |
| 477 | mm = get_task_mm(task); | 477 | mm = get_task_mm(task); |
| 478 | if (mm) { | 478 | if (mm) { |
| 479 | struct mm_walk clear_refs_walk = { | ||
| 480 | .pmd_entry = clear_refs_pte_range, | ||
| 481 | .mm = mm, | ||
| 482 | }; | ||
| 479 | down_read(&mm->mmap_sem); | 483 | down_read(&mm->mmap_sem); |
| 480 | for (vma = mm->mmap; vma; vma = vma->vm_next) | 484 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 485 | clear_refs_walk.private = vma; | ||
| 481 | if (!is_vm_hugetlb_page(vma)) | 486 | if (!is_vm_hugetlb_page(vma)) |
| 482 | walk_page_range(mm, vma->vm_start, vma->vm_end, | 487 | walk_page_range(vma->vm_start, vma->vm_end, |
| 483 | &clear_refs_walk, vma); | 488 | &clear_refs_walk); |
| 489 | } | ||
| 484 | flush_tlb_mm(mm); | 490 | flush_tlb_mm(mm); |
| 485 | up_read(&mm->mmap_sem); | 491 | up_read(&mm->mmap_sem); |
| 486 | mmput(mm); | 492 | mmput(mm); |
| @@ -528,9 +534,9 @@ static int add_to_pagemap(unsigned long addr, u64 pfn, | |||
| 528 | } | 534 | } |
| 529 | 535 | ||
| 530 | static int pagemap_pte_hole(unsigned long start, unsigned long end, | 536 | static int pagemap_pte_hole(unsigned long start, unsigned long end, |
| 531 | void *private) | 537 | struct mm_walk *walk) |
| 532 | { | 538 | { |
| 533 | struct pagemapread *pm = private; | 539 | struct pagemapread *pm = walk->private; |
| 534 | unsigned long addr; | 540 | unsigned long addr; |
| 535 | int err = 0; | 541 | int err = 0; |
| 536 | for (addr = start; addr < end; addr += PAGE_SIZE) { | 542 | for (addr = start; addr < end; addr += PAGE_SIZE) { |
| @@ -547,24 +553,45 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) | |||
| 547 | return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); | 553 | return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); |
| 548 | } | 554 | } |
| 549 | 555 | ||
| 556 | static unsigned long pte_to_pagemap_entry(pte_t pte) | ||
| 557 | { | ||
| 558 | unsigned long pme = 0; | ||
| 559 | if (is_swap_pte(pte)) | ||
| 560 | pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) | ||
| 561 | | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; | ||
| 562 | else if (pte_present(pte)) | ||
| 563 | pme = PM_PFRAME(pte_pfn(pte)) | ||
| 564 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; | ||
| 565 | return pme; | ||
| 566 | } | ||
| 567 | |||
| 550 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 568 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
| 551 | void *private) | 569 | struct mm_walk *walk) |
| 552 | { | 570 | { |
| 553 | struct pagemapread *pm = private; | 571 | struct vm_area_struct *vma; |
| 572 | struct pagemapread *pm = walk->private; | ||
| 554 | pte_t *pte; | 573 | pte_t *pte; |
| 555 | int err = 0; | 574 | int err = 0; |
| 556 | 575 | ||
| 576 | /* find the first VMA at or above 'addr' */ | ||
| 577 | vma = find_vma(walk->mm, addr); | ||
| 557 | for (; addr != end; addr += PAGE_SIZE) { | 578 | for (; addr != end; addr += PAGE_SIZE) { |
| 558 | u64 pfn = PM_NOT_PRESENT; | 579 | u64 pfn = PM_NOT_PRESENT; |
| 559 | pte = pte_offset_map(pmd, addr); | 580 | |
| 560 | if (is_swap_pte(*pte)) | 581 | /* check to see if we've left 'vma' behind |
| 561 | pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte)) | 582 | * and need a new, higher one */ |
| 562 | | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; | 583 | if (vma && (addr >= vma->vm_end)) |
| 563 | else if (pte_present(*pte)) | 584 | vma = find_vma(walk->mm, addr); |
| 564 | pfn = PM_PFRAME(pte_pfn(*pte)) | 585 | |
| 565 | | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; | 586 | /* check that 'vma' actually covers this address, |
| 566 | /* unmap so we're not in atomic when we copy to userspace */ | 587 | * and that it isn't a huge page vma */ |
| 567 | pte_unmap(pte); | 588 | if (vma && (vma->vm_start <= addr) && |
| 589 | !is_vm_hugetlb_page(vma)) { | ||
| 590 | pte = pte_offset_map(pmd, addr); | ||
| 591 | pfn = pte_to_pagemap_entry(*pte); | ||
| 592 | /* unmap before userspace copy */ | ||
| 593 | pte_unmap(pte); | ||
| 594 | } | ||
| 568 | err = add_to_pagemap(addr, pfn, pm); | 595 | err = add_to_pagemap(addr, pfn, pm); |
| 569 | if (err) | 596 | if (err) |
| 570 | return err; | 597 | return err; |
| @@ -575,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
| 575 | return err; | 602 | return err; |
| 576 | } | 603 | } |
| 577 | 604 | ||
| 578 | static struct mm_walk pagemap_walk = { | ||
| 579 | .pmd_entry = pagemap_pte_range, | ||
| 580 | .pte_hole = pagemap_pte_hole | ||
| 581 | }; | ||
| 582 | |||
| 583 | /* | 605 | /* |
| 584 | * /proc/pid/pagemap - an array mapping virtual pages to pfns | 606 | * /proc/pid/pagemap - an array mapping virtual pages to pfns |
| 585 | * | 607 | * |
| @@ -614,12 +636,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
| 614 | struct pagemapread pm; | 636 | struct pagemapread pm; |
| 615 | int pagecount; | 637 | int pagecount; |
| 616 | int ret = -ESRCH; | 638 | int ret = -ESRCH; |
| 639 | struct mm_walk pagemap_walk; | ||
| 640 | unsigned long src; | ||
| 641 | unsigned long svpfn; | ||
| 642 | unsigned long start_vaddr; | ||
| 643 | unsigned long end_vaddr; | ||
| 617 | 644 | ||
| 618 | if (!task) | 645 | if (!task) |
| 619 | goto out; | 646 | goto out; |
| 620 | 647 | ||
| 621 | ret = -EACCES; | 648 | ret = -EACCES; |
| 622 | if (!ptrace_may_attach(task)) | 649 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
| 623 | goto out_task; | 650 | goto out_task; |
| 624 | 651 | ||
| 625 | ret = -EINVAL; | 652 | ret = -EINVAL; |
| @@ -632,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
| 632 | if (!mm) | 659 | if (!mm) |
| 633 | goto out_task; | 660 | goto out_task; |
| 634 | 661 | ||
| 635 | ret = -ENOMEM; | 662 | |
| 636 | uaddr = (unsigned long)buf & PAGE_MASK; | 663 | uaddr = (unsigned long)buf & PAGE_MASK; |
| 637 | uend = (unsigned long)(buf + count); | 664 | uend = (unsigned long)(buf + count); |
| 638 | pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; | 665 | pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; |
| 639 | pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); | 666 | ret = 0; |
| 667 | if (pagecount == 0) | ||
| 668 | goto out_mm; | ||
| 669 | pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); | ||
| 670 | ret = -ENOMEM; | ||
| 640 | if (!pages) | 671 | if (!pages) |
| 641 | goto out_mm; | 672 | goto out_mm; |
| 642 | 673 | ||
| @@ -657,33 +688,33 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
| 657 | pm.out = (u64 *)buf; | 688 | pm.out = (u64 *)buf; |
| 658 | pm.end = (u64 *)(buf + count); | 689 | pm.end = (u64 *)(buf + count); |
| 659 | 690 | ||
| 660 | if (!ptrace_may_attach(task)) { | 691 | pagemap_walk.pmd_entry = pagemap_pte_range; |
| 661 | ret = -EIO; | 692 | pagemap_walk.pte_hole = pagemap_pte_hole; |
| 662 | } else { | 693 | pagemap_walk.mm = mm; |
| 663 | unsigned long src = *ppos; | 694 | pagemap_walk.private = ± |
| 664 | unsigned long svpfn = src / PM_ENTRY_BYTES; | 695 | |
| 665 | unsigned long start_vaddr = svpfn << PAGE_SHIFT; | 696 | src = *ppos; |
| 666 | unsigned long end_vaddr = TASK_SIZE_OF(task); | 697 | svpfn = src / PM_ENTRY_BYTES; |
| 667 | 698 | start_vaddr = svpfn << PAGE_SHIFT; | |
| 668 | /* watch out for wraparound */ | 699 | end_vaddr = TASK_SIZE_OF(task); |
| 669 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) | 700 | |
| 670 | start_vaddr = end_vaddr; | 701 | /* watch out for wraparound */ |
| 671 | 702 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) | |
| 672 | /* | 703 | start_vaddr = end_vaddr; |
| 673 | * The odds are that this will stop walking way | 704 | |
| 674 | * before end_vaddr, because the length of the | 705 | /* |
| 675 | * user buffer is tracked in "pm", and the walk | 706 | * The odds are that this will stop walking way |
| 676 | * will stop when we hit the end of the buffer. | 707 | * before end_vaddr, because the length of the |
| 677 | */ | 708 | * user buffer is tracked in "pm", and the walk |
| 678 | ret = walk_page_range(mm, start_vaddr, end_vaddr, | 709 | * will stop when we hit the end of the buffer. |
| 679 | &pagemap_walk, &pm); | 710 | */ |
| 680 | if (ret == PM_END_OF_BUFFER) | 711 | ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); |
| 681 | ret = 0; | 712 | if (ret == PM_END_OF_BUFFER) |
| 682 | /* don't need mmap_sem for these, but this looks cleaner */ | 713 | ret = 0; |
| 683 | *ppos += (char *)pm.out - buf; | 714 | /* don't need mmap_sem for these, but this looks cleaner */ |
| 684 | if (!ret) | 715 | *ppos += (char *)pm.out - buf; |
| 685 | ret = (char *)pm.out - buf; | 716 | if (!ret) |
| 686 | } | 717 | ret = (char *)pm.out - buf; |
| 687 | 718 | ||
| 688 | out_pages: | 719 | out_pages: |
| 689 | for (; pagecount; pagecount--) { | 720 | for (; pagecount; pagecount--) { |
| @@ -716,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v) | |||
| 716 | struct proc_maps_private *priv = m->private; | 747 | struct proc_maps_private *priv = m->private; |
| 717 | struct task_struct *task = priv->task; | 748 | struct task_struct *task = priv->task; |
| 718 | 749 | ||
| 719 | if (maps_protect && !ptrace_may_attach(task)) | 750 | if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) |
| 720 | return -EACCES; | 751 | return -EACCES; |
| 721 | 752 | ||
| 722 | return show_numa_map(m, v); | 753 | return show_numa_map(m, v); |
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b4f9cc2f186..5d84e7121df8 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
| @@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml) | |||
| 113 | struct proc_maps_private *priv = m->private; | 113 | struct proc_maps_private *priv = m->private; |
| 114 | struct task_struct *task = priv->task; | 114 | struct task_struct *task = priv->task; |
| 115 | 115 | ||
| 116 | if (maps_protect && !ptrace_may_attach(task)) | 116 | if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) |
| 117 | return -EACCES; | 117 | return -EACCES; |
| 118 | 118 | ||
| 119 | return nommu_vma_show(m, vml->vma); | 119 | return nommu_vma_show(m, vml->vma); |
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 9590b9024300..78f613cb9c76 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c | |||
| @@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = { | |||
| 45 | .mmap = generic_file_mmap, | 45 | .mmap = generic_file_mmap, |
| 46 | .fsync = simple_sync_file, | 46 | .fsync = simple_sync_file, |
| 47 | .splice_read = generic_file_splice_read, | 47 | .splice_read = generic_file_splice_read, |
| 48 | .splice_write = generic_file_splice_write, | ||
| 48 | .llseek = generic_file_llseek, | 49 | .llseek = generic_file_llseek, |
| 49 | }; | 50 | }; |
| 50 | 51 | ||
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0989bc2c2f69..52312ec93ff4 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c | |||
| @@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = { | |||
| 43 | .aio_write = generic_file_aio_write, | 43 | .aio_write = generic_file_aio_write, |
| 44 | .fsync = simple_sync_file, | 44 | .fsync = simple_sync_file, |
| 45 | .splice_read = generic_file_splice_read, | 45 | .splice_read = generic_file_splice_read, |
| 46 | .splice_write = generic_file_splice_write, | ||
| 46 | .llseek = generic_file_llseek, | 47 | .llseek = generic_file_llseek, |
| 47 | }; | 48 | }; |
| 48 | 49 | ||
diff --git a/fs/read_write.c b/fs/read_write.c index f0d1240a5c69..9ba495d5a29b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
| @@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = { | |||
| 31 | 31 | ||
| 32 | EXPORT_SYMBOL(generic_ro_fops); | 32 | EXPORT_SYMBOL(generic_ro_fops); |
| 33 | 33 | ||
| 34 | loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) | 34 | loff_t |
| 35 | generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) | ||
| 35 | { | 36 | { |
| 36 | loff_t retval; | 37 | loff_t retval; |
| 37 | struct inode *inode = file->f_mapping->host; | 38 | struct inode *inode = file->f_mapping->host; |
| 38 | 39 | ||
| 39 | mutex_lock(&inode->i_mutex); | ||
| 40 | switch (origin) { | 40 | switch (origin) { |
| 41 | case SEEK_END: | 41 | case SEEK_END: |
| 42 | offset += inode->i_size; | 42 | offset += inode->i_size; |
| @@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) | |||
| 46 | } | 46 | } |
| 47 | retval = -EINVAL; | 47 | retval = -EINVAL; |
| 48 | if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { | 48 | if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { |
| 49 | /* Special lock needed here? */ | ||
| 49 | if (offset != file->f_pos) { | 50 | if (offset != file->f_pos) { |
| 50 | file->f_pos = offset; | 51 | file->f_pos = offset; |
| 51 | file->f_version = 0; | 52 | file->f_version = 0; |
| 52 | } | 53 | } |
| 53 | retval = offset; | 54 | retval = offset; |
| 54 | } | 55 | } |
| 55 | mutex_unlock(&inode->i_mutex); | ||
| 56 | return retval; | 56 | return retval; |
| 57 | } | 57 | } |
| 58 | EXPORT_SYMBOL(generic_file_llseek_unlocked); | ||
| 58 | 59 | ||
| 59 | EXPORT_SYMBOL(generic_file_llseek); | 60 | loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) |
| 60 | |||
| 61 | loff_t remote_llseek(struct file *file, loff_t offset, int origin) | ||
| 62 | { | 61 | { |
| 63 | loff_t retval; | 62 | loff_t n; |
| 64 | 63 | mutex_lock(&file->f_dentry->d_inode->i_mutex); | |
| 65 | lock_kernel(); | 64 | n = generic_file_llseek_unlocked(file, offset, origin); |
| 66 | switch (origin) { | 65 | mutex_unlock(&file->f_dentry->d_inode->i_mutex); |
| 67 | case SEEK_END: | 66 | return n; |
| 68 | offset += i_size_read(file->f_path.dentry->d_inode); | ||
| 69 | break; | ||
| 70 | case SEEK_CUR: | ||
| 71 | offset += file->f_pos; | ||
| 72 | } | ||
| 73 | retval = -EINVAL; | ||
| 74 | if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { | ||
| 75 | if (offset != file->f_pos) { | ||
| 76 | file->f_pos = offset; | ||
| 77 | file->f_version = 0; | ||
| 78 | } | ||
| 79 | retval = offset; | ||
| 80 | } | ||
| 81 | unlock_kernel(); | ||
| 82 | return retval; | ||
| 83 | } | 67 | } |
| 84 | EXPORT_SYMBOL(remote_llseek); | 68 | EXPORT_SYMBOL(generic_file_llseek); |
| 85 | 69 | ||
| 86 | loff_t no_llseek(struct file *file, loff_t offset, int origin) | 70 | loff_t no_llseek(struct file *file, loff_t offset, int origin) |
| 87 | { | 71 | { |
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 57917932212e..192269698a8a 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
| @@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode) | |||
| 45 | goto out; | 45 | goto out; |
| 46 | reiserfs_update_inode_transaction(inode); | 46 | reiserfs_update_inode_transaction(inode); |
| 47 | 47 | ||
| 48 | reiserfs_discard_prealloc(&th, inode); | ||
| 49 | |||
| 48 | err = reiserfs_delete_object(&th, inode); | 50 | err = reiserfs_delete_object(&th, inode); |
| 49 | 51 | ||
| 50 | /* Do quota update inside a transaction for journaled quotas. We must do that | 52 | /* Do quota update inside a transaction for journaled quotas. We must do that |
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ed424d708e69..1d40f2bd1970 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
| @@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, | |||
| 2165 | blk++; | 2165 | blk++; |
| 2166 | } | 2166 | } |
| 2167 | out: | 2167 | out: |
| 2168 | if (len == towrite) | 2168 | if (len == towrite) { |
| 2169 | mutex_unlock(&inode->i_mutex); | ||
| 2169 | return err; | 2170 | return err; |
| 2171 | } | ||
| 2170 | if (inode->i_size < off + len - towrite) | 2172 | if (inode->i_size < off + len - towrite) |
| 2171 | i_size_write(inode, off + len - towrite); | 2173 | i_size_write(inode, off + len - towrite); |
| 2172 | inode->i_version++; | 2174 | inode->i_version++; |
diff --git a/fs/select.c b/fs/select.c index 8dda969614a9..da0e88201c3a 100644 --- a/fs/select.c +++ b/fs/select.c | |||
| @@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) | |||
| 249 | retval++; | 249 | retval++; |
| 250 | } | 250 | } |
| 251 | } | 251 | } |
| 252 | cond_resched(); | ||
| 253 | } | 252 | } |
| 254 | if (res_in) | 253 | if (res_in) |
| 255 | *rinp = res_in; | 254 | *rinp = res_in; |
| @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) | |||
| 257 | *routp = res_out; | 256 | *routp = res_out; |
| 258 | if (res_ex) | 257 | if (res_ex) |
| 259 | *rexp = res_ex; | 258 | *rexp = res_ex; |
| 259 | cond_resched(); | ||
| 260 | } | 260 | } |
| 261 | wait = NULL; | 261 | wait = NULL; |
| 262 | if (retval || !*timeout || signal_pending(current)) | 262 | if (retval || !*timeout || signal_pending(current)) |
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index efbe29af3d7a..2294783320cb 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c | |||
| @@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 422 | return error; | 422 | return error; |
| 423 | } | 423 | } |
| 424 | 424 | ||
| 425 | static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) | ||
| 426 | { | ||
| 427 | loff_t ret; | ||
| 428 | lock_kernel(); | ||
| 429 | ret = generic_file_llseek_unlocked(file, offset, origin); | ||
| 430 | unlock_kernel(); | ||
| 431 | return ret; | ||
| 432 | } | ||
| 433 | |||
| 425 | const struct file_operations smb_file_operations = | 434 | const struct file_operations smb_file_operations = |
| 426 | { | 435 | { |
| 427 | .llseek = remote_llseek, | 436 | .llseek = smb_remote_llseek, |
| 428 | .read = do_sync_read, | 437 | .read = do_sync_read, |
| 429 | .aio_read = smb_file_aio_read, | 438 | .aio_read = smb_file_aio_read, |
| 430 | .write = do_sync_write, | 439 | .write = do_sync_write, |
diff --git a/fs/splice.c b/fs/splice.c index aa5f6f60b305..399442179d89 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
| @@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
| 379 | lock_page(page); | 379 | lock_page(page); |
| 380 | 380 | ||
| 381 | /* | 381 | /* |
| 382 | * page was truncated, stop here. if this isn't the | 382 | * Page was truncated, or invalidated by the |
| 383 | * first page, we'll just complete what we already | 383 | * filesystem. Redo the find/create, but this time the |
| 384 | * added | 384 | * page is kept locked, so there's no chance of another |
| 385 | * race with truncate/invalidate. | ||
| 385 | */ | 386 | */ |
| 386 | if (!page->mapping) { | 387 | if (!page->mapping) { |
| 387 | unlock_page(page); | 388 | unlock_page(page); |
| 388 | break; | 389 | page = find_or_create_page(mapping, index, |
| 390 | mapping_gfp_mask(mapping)); | ||
| 391 | |||
| 392 | if (!page) { | ||
| 393 | error = -ENOMEM; | ||
| 394 | break; | ||
| 395 | } | ||
| 396 | page_cache_release(pages[page_nr]); | ||
| 397 | pages[page_nr] = page; | ||
| 389 | } | 398 | } |
| 390 | /* | 399 | /* |
| 391 | * page was already under io and is now done, great | 400 | * page was already under io and is now done, great |
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig new file mode 100644 index 000000000000..91ceeda7e5bf --- /dev/null +++ b/fs/ubifs/Kconfig | |||
| @@ -0,0 +1,72 @@ | |||
| 1 | config UBIFS_FS | ||
| 2 | tristate "UBIFS file system support" | ||
| 3 | select CRC16 | ||
| 4 | select CRC32 | ||
| 5 | select CRYPTO if UBIFS_FS_ADVANCED_COMPR | ||
| 6 | select CRYPTO if UBIFS_FS_LZO | ||
| 7 | select CRYPTO if UBIFS_FS_ZLIB | ||
| 8 | select CRYPTO_LZO if UBIFS_FS_LZO | ||
| 9 | select CRYPTO_DEFLATE if UBIFS_FS_ZLIB | ||
| 10 | depends on MTD_UBI | ||
| 11 | help | ||
| 12 | UBIFS is a file system for flash devices which works on top of UBI. | ||
| 13 | |||
| 14 | config UBIFS_FS_XATTR | ||
| 15 | bool "Extended attributes support" | ||
| 16 | depends on UBIFS_FS | ||
| 17 | help | ||
| 18 | This option enables support of extended attributes. | ||
| 19 | |||
| 20 | config UBIFS_FS_ADVANCED_COMPR | ||
| 21 | bool "Advanced compression options" | ||
| 22 | depends on UBIFS_FS | ||
| 23 | help | ||
| 24 | This option allows to explicitly choose which compressions, if any, | ||
| 25 | are enabled in UBIFS. Removing compressors means inbility to read | ||
| 26 | existing file systems. | ||
| 27 | |||
| 28 | If unsure, say 'N'. | ||
| 29 | |||
| 30 | config UBIFS_FS_LZO | ||
| 31 | bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR | ||
| 32 | depends on UBIFS_FS | ||
| 33 | default y | ||
| 34 | help | ||
| 35 | LZO compressor is generally faster then zlib but compresses worse. | ||
| 36 | Say 'Y' if unsure. | ||
| 37 | |||
| 38 | config UBIFS_FS_ZLIB | ||
| 39 | bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR | ||
| 40 | depends on UBIFS_FS | ||
| 41 | default y | ||
| 42 | help | ||
| 43 | Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. | ||
| 44 | |||
| 45 | # Debugging-related stuff | ||
| 46 | config UBIFS_FS_DEBUG | ||
| 47 | bool "Enable debugging" | ||
| 48 | depends on UBIFS_FS | ||
| 49 | select DEBUG_FS | ||
| 50 | select KALLSYMS_ALL | ||
| 51 | help | ||
| 52 | This option enables UBIFS debugging. | ||
| 53 | |||
| 54 | config UBIFS_FS_DEBUG_MSG_LVL | ||
| 55 | int "Default message level (0 = no extra messages, 3 = lots)" | ||
| 56 | depends on UBIFS_FS_DEBUG | ||
| 57 | default "0" | ||
| 58 | help | ||
| 59 | This controls the amount of debugging messages produced by UBIFS. | ||
| 60 | If reporting bugs, please try to have available a full dump of the | ||
| 61 | messages at level 1 while the misbehaviour was occurring. Level 2 | ||
| 62 | may become necessary if level 1 messages were not enough to find the | ||
| 63 | bug. Generally Level 3 should be avoided. | ||
| 64 | |||
| 65 | config UBIFS_FS_DEBUG_CHKS | ||
| 66 | bool "Enable extra checks" | ||
| 67 | depends on UBIFS_FS_DEBUG | ||
| 68 | help | ||
| 69 | If extra checks are enabled UBIFS will check the consistency of its | ||
| 70 | internal data structures during operation. However, UBIFS performance | ||
| 71 | is dramatically slower when this option is selected especially if the | ||
| 72 | file system is large. | ||
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile new file mode 100644 index 000000000000..80e93c35e496 --- /dev/null +++ b/fs/ubifs/Makefile | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | obj-$(CONFIG_UBIFS_FS) += ubifs.o | ||
| 2 | |||
| 3 | ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o | ||
| 4 | ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o | ||
| 5 | ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o | ||
| 6 | ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o | ||
| 7 | |||
| 8 | ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o | ||
| 9 | ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o | ||
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c new file mode 100644 index 000000000000..d81fb9ed2b8e --- /dev/null +++ b/fs/ubifs/budget.c | |||
| @@ -0,0 +1,731 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements the budgeting sub-system which is responsible for UBIFS | ||
| 25 | * space management. | ||
| 26 | * | ||
| 27 | * Factors such as compression, wasted space at the ends of LEBs, space in other | ||
| 28 | * journal heads, the effect of updates on the index, and so on, make it | ||
| 29 | * impossible to accurately predict the amount of space needed. Consequently | ||
| 30 | * approximations are used. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include "ubifs.h" | ||
| 34 | #include <linux/writeback.h> | ||
| 35 | #include <asm/div64.h> | ||
| 36 | |||
| 37 | /* | ||
| 38 | * When pessimistic budget calculations say that there is no enough space, | ||
| 39 | * UBIFS starts writing back dirty inodes and pages, doing garbage collection, | ||
| 40 | * or committing. The below constants define maximum number of times UBIFS | ||
| 41 | * repeats the operations. | ||
| 42 | */ | ||
| 43 | #define MAX_SHRINK_RETRIES 8 | ||
| 44 | #define MAX_GC_RETRIES 4 | ||
| 45 | #define MAX_CMT_RETRIES 2 | ||
| 46 | #define MAX_NOSPC_RETRIES 1 | ||
| 47 | |||
| 48 | /* | ||
| 49 | * The below constant defines amount of dirty pages which should be written | ||
| 50 | * back at when trying to shrink the liability. | ||
| 51 | */ | ||
| 52 | #define NR_TO_WRITE 16 | ||
| 53 | |||
| 54 | /** | ||
| 55 | * struct retries_info - information about re-tries while making free space. | ||
| 56 | * @prev_liability: previous liability | ||
| 57 | * @shrink_cnt: how many times the liability was shrinked | ||
| 58 | * @shrink_retries: count of liability shrink re-tries (increased when | ||
| 59 | * liability does not shrink) | ||
| 60 | * @try_gc: GC should be tried first | ||
| 61 | * @gc_retries: how many times GC was run | ||
| 62 | * @cmt_retries: how many times commit has been done | ||
| 63 | * @nospc_retries: how many times GC returned %-ENOSPC | ||
| 64 | * | ||
| 65 | * Since we consider budgeting to be the fast-path, and this structure has to | ||
| 66 | * be allocated on stack and zeroed out, we make it smaller using bit-fields. | ||
| 67 | */ | ||
| 68 | struct retries_info { | ||
| 69 | long long prev_liability; | ||
| 70 | unsigned int shrink_cnt; | ||
| 71 | unsigned int shrink_retries:5; | ||
| 72 | unsigned int try_gc:1; | ||
| 73 | unsigned int gc_retries:4; | ||
| 74 | unsigned int cmt_retries:3; | ||
| 75 | unsigned int nospc_retries:1; | ||
| 76 | }; | ||
| 77 | |||
| 78 | /** | ||
| 79 | * shrink_liability - write-back some dirty pages/inodes. | ||
| 80 | * @c: UBIFS file-system description object | ||
| 81 | * @nr_to_write: how many dirty pages to write-back | ||
| 82 | * | ||
| 83 | * This function shrinks UBIFS liability by means of writing back some amount | ||
| 84 | * of dirty inodes and their pages. Returns the amount of pages which were | ||
| 85 | * written back. The returned value does not include dirty inodes which were | ||
| 86 | * synchronized. | ||
| 87 | * | ||
| 88 | * Note, this function synchronizes even VFS inodes which are locked | ||
| 89 | * (@i_mutex) by the caller of the budgeting function, because write-back does | ||
| 90 | * not touch @i_mutex. | ||
| 91 | */ | ||
| 92 | static int shrink_liability(struct ubifs_info *c, int nr_to_write) | ||
| 93 | { | ||
| 94 | int nr_written; | ||
| 95 | struct writeback_control wbc = { | ||
| 96 | .sync_mode = WB_SYNC_NONE, | ||
| 97 | .range_end = LLONG_MAX, | ||
| 98 | .nr_to_write = nr_to_write, | ||
| 99 | }; | ||
| 100 | |||
| 101 | generic_sync_sb_inodes(c->vfs_sb, &wbc); | ||
| 102 | nr_written = nr_to_write - wbc.nr_to_write; | ||
| 103 | |||
| 104 | if (!nr_written) { | ||
| 105 | /* | ||
| 106 | * Re-try again but wait on pages/inodes which are being | ||
| 107 | * written-back concurrently (e.g., by pdflush). | ||
| 108 | */ | ||
| 109 | memset(&wbc, 0, sizeof(struct writeback_control)); | ||
| 110 | wbc.sync_mode = WB_SYNC_ALL; | ||
| 111 | wbc.range_end = LLONG_MAX; | ||
| 112 | wbc.nr_to_write = nr_to_write; | ||
| 113 | generic_sync_sb_inodes(c->vfs_sb, &wbc); | ||
| 114 | nr_written = nr_to_write - wbc.nr_to_write; | ||
| 115 | } | ||
| 116 | |||
| 117 | dbg_budg("%d pages were written back", nr_written); | ||
| 118 | return nr_written; | ||
| 119 | } | ||
| 120 | |||
| 121 | |||
| 122 | /** | ||
| 123 | * run_gc - run garbage collector. | ||
| 124 | * @c: UBIFS file-system description object | ||
| 125 | * | ||
| 126 | * This function runs garbage collector to make some more free space. Returns | ||
| 127 | * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a | ||
| 128 | * negative error code in case of failure. | ||
| 129 | */ | ||
| 130 | static int run_gc(struct ubifs_info *c) | ||
| 131 | { | ||
| 132 | int err, lnum; | ||
| 133 | |||
| 134 | /* Make some free space by garbage-collecting dirty space */ | ||
| 135 | down_read(&c->commit_sem); | ||
| 136 | lnum = ubifs_garbage_collect(c, 1); | ||
| 137 | up_read(&c->commit_sem); | ||
| 138 | if (lnum < 0) | ||
| 139 | return lnum; | ||
| 140 | |||
| 141 | /* GC freed one LEB, return it to lprops */ | ||
| 142 | dbg_budg("GC freed LEB %d", lnum); | ||
| 143 | err = ubifs_return_leb(c, lnum); | ||
| 144 | if (err) | ||
| 145 | return err; | ||
| 146 | return 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | /** | ||
| 150 | * make_free_space - make more free space on the file-system. | ||
| 151 | * @c: UBIFS file-system description object | ||
| 152 | * @ri: information about previous invocations of this function | ||
| 153 | * | ||
| 154 | * This function is called when an operation cannot be budgeted because there | ||
| 155 | * is supposedly no free space. But in most cases there is some free space: | ||
| 156 | * o budgeting is pessimistic, so it always budgets more then it is actually | ||
| 157 | * needed, so shrinking the liability is one way to make free space - the | ||
| 158 | * cached data will take less space then it was budgeted for; | ||
| 159 | * o GC may turn some dark space into free space (budgeting treats dark space | ||
| 160 | * as not available); | ||
| 161 | * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. | ||
| 162 | * | ||
| 163 | * So this function tries to do the above. Returns %-EAGAIN if some free space | ||
| 164 | * was presumably made and the caller has to re-try budgeting the operation. | ||
| 165 | * Returns %-ENOSPC if it couldn't do more free space, and other negative error | ||
| 166 | * codes on failures. | ||
| 167 | */ | ||
| 168 | static int make_free_space(struct ubifs_info *c, struct retries_info *ri) | ||
| 169 | { | ||
| 170 | int err; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * If we have some dirty pages and inodes (liability), try to write | ||
| 174 | * them back unless this was tried too many times without effect | ||
| 175 | * already. | ||
| 176 | */ | ||
| 177 | if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) { | ||
| 178 | long long liability; | ||
| 179 | |||
| 180 | spin_lock(&c->space_lock); | ||
| 181 | liability = c->budg_idx_growth + c->budg_data_growth + | ||
| 182 | c->budg_dd_growth; | ||
| 183 | spin_unlock(&c->space_lock); | ||
| 184 | |||
| 185 | if (ri->prev_liability >= liability) { | ||
| 186 | /* Liability does not shrink, next time try GC then */ | ||
| 187 | ri->shrink_retries += 1; | ||
| 188 | if (ri->gc_retries < MAX_GC_RETRIES) | ||
| 189 | ri->try_gc = 1; | ||
| 190 | dbg_budg("liability did not shrink: retries %d of %d", | ||
| 191 | ri->shrink_retries, MAX_SHRINK_RETRIES); | ||
| 192 | } | ||
| 193 | |||
| 194 | dbg_budg("force write-back (count %d)", ri->shrink_cnt); | ||
| 195 | shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); | ||
| 196 | |||
| 197 | ri->prev_liability = liability; | ||
| 198 | ri->shrink_cnt += 1; | ||
| 199 | return -EAGAIN; | ||
| 200 | } | ||
| 201 | |||
| 202 | /* | ||
| 203 | * Try to run garbage collector unless it was already tried too many | ||
| 204 | * times. | ||
| 205 | */ | ||
| 206 | if (ri->gc_retries < MAX_GC_RETRIES) { | ||
| 207 | ri->gc_retries += 1; | ||
| 208 | dbg_budg("run GC, retries %d of %d", | ||
| 209 | ri->gc_retries, MAX_GC_RETRIES); | ||
| 210 | |||
| 211 | ri->try_gc = 0; | ||
| 212 | err = run_gc(c); | ||
| 213 | if (!err) | ||
| 214 | return -EAGAIN; | ||
| 215 | |||
| 216 | if (err == -EAGAIN) { | ||
| 217 | dbg_budg("GC asked to commit"); | ||
| 218 | err = ubifs_run_commit(c); | ||
| 219 | if (err) | ||
| 220 | return err; | ||
| 221 | return -EAGAIN; | ||
| 222 | } | ||
| 223 | |||
| 224 | if (err != -ENOSPC) | ||
| 225 | return err; | ||
| 226 | |||
| 227 | /* | ||
| 228 | * GC could not make any progress. If this is the first time, | ||
| 229 | * then it makes sense to try to commit, because it might make | ||
| 230 | * some dirty space. | ||
| 231 | */ | ||
| 232 | dbg_budg("GC returned -ENOSPC, retries %d", | ||
| 233 | ri->nospc_retries); | ||
| 234 | if (ri->nospc_retries >= MAX_NOSPC_RETRIES) | ||
| 235 | return err; | ||
| 236 | ri->nospc_retries += 1; | ||
| 237 | } | ||
| 238 | |||
| 239 | /* Neither GC nor write-back helped, try to commit */ | ||
| 240 | if (ri->cmt_retries < MAX_CMT_RETRIES) { | ||
| 241 | ri->cmt_retries += 1; | ||
| 242 | dbg_budg("run commit, retries %d of %d", | ||
| 243 | ri->cmt_retries, MAX_CMT_RETRIES); | ||
| 244 | err = ubifs_run_commit(c); | ||
| 245 | if (err) | ||
| 246 | return err; | ||
| 247 | return -EAGAIN; | ||
| 248 | } | ||
| 249 | return -ENOSPC; | ||
| 250 | } | ||
| 251 | |||
| 252 | /** | ||
| 253 | * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. | ||
| 254 | * @c: UBIFS file-system description object | ||
| 255 | * | ||
| 256 | * This function calculates and returns the number of eraseblocks which should | ||
| 257 | * be kept for index usage. | ||
| 258 | */ | ||
| 259 | int ubifs_calc_min_idx_lebs(struct ubifs_info *c) | ||
| 260 | { | ||
| 261 | int ret; | ||
| 262 | uint64_t idx_size; | ||
| 263 | |||
| 264 | idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; | ||
| 265 | |||
| 266 | /* And make sure we have twice the index size of space reserved */ | ||
| 267 | idx_size <<= 1; | ||
| 268 | |||
| 269 | /* | ||
| 270 | * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' | ||
| 271 | * pair, nor similarly the two variables for the new index size, so we | ||
| 272 | * have to do this costly 64-bit division on fast-path. | ||
| 273 | */ | ||
| 274 | if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) | ||
| 275 | ret = idx_size + 1; | ||
| 276 | else | ||
| 277 | ret = idx_size; | ||
| 278 | /* | ||
| 279 | * The index head is not available for the in-the-gaps method, so add an | ||
| 280 | * extra LEB to compensate. | ||
| 281 | */ | ||
| 282 | ret += 1; | ||
| 283 | /* | ||
| 284 | * At present the index needs at least 2 LEBs: one for the index head | ||
| 285 | * and one for in-the-gaps method (which currently does not cater for | ||
| 286 | * the index head and so excludes it from consideration). | ||
| 287 | */ | ||
| 288 | if (ret < 2) | ||
| 289 | ret = 2; | ||
| 290 | return ret; | ||
| 291 | } | ||
| 292 | |||
| 293 | /** | ||
| 294 | * ubifs_calc_available - calculate available FS space. | ||
| 295 | * @c: UBIFS file-system description object | ||
| 296 | * @min_idx_lebs: minimum number of LEBs reserved for the index | ||
| 297 | * | ||
| 298 | * This function calculates and returns amount of FS space available for use. | ||
| 299 | */ | ||
| 300 | long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) | ||
| 301 | { | ||
| 302 | int subtract_lebs; | ||
| 303 | long long available; | ||
| 304 | |||
| 305 | /* | ||
| 306 | * Force the amount available to the total size reported if the used | ||
| 307 | * space is zero. | ||
| 308 | */ | ||
| 309 | if (c->lst.total_used <= UBIFS_INO_NODE_SZ && | ||
| 310 | c->budg_data_growth + c->budg_dd_growth == 0) { | ||
| 311 | /* Do the same calculation as for c->block_cnt */ | ||
| 312 | available = c->main_lebs - 2; | ||
| 313 | available *= c->leb_size - c->dark_wm; | ||
| 314 | return available; | ||
| 315 | } | ||
| 316 | |||
| 317 | available = c->main_bytes - c->lst.total_used; | ||
| 318 | |||
| 319 | /* | ||
| 320 | * Now 'available' contains theoretically available flash space | ||
| 321 | * assuming there is no index, so we have to subtract the space which | ||
| 322 | * is reserved for the index. | ||
| 323 | */ | ||
| 324 | subtract_lebs = min_idx_lebs; | ||
| 325 | |||
| 326 | /* Take into account that GC reserves one LEB for its own needs */ | ||
| 327 | subtract_lebs += 1; | ||
| 328 | |||
| 329 | /* | ||
| 330 | * The GC journal head LEB is not really accessible. And since | ||
| 331 | * different write types go to different heads, we may count only on | ||
| 332 | * one head's space. | ||
| 333 | */ | ||
| 334 | subtract_lebs += c->jhead_cnt - 1; | ||
| 335 | |||
| 336 | /* We also reserve one LEB for deletions, which bypass budgeting */ | ||
| 337 | subtract_lebs += 1; | ||
| 338 | |||
| 339 | available -= (long long)subtract_lebs * c->leb_size; | ||
| 340 | |||
| 341 | /* Subtract the dead space which is not available for use */ | ||
| 342 | available -= c->lst.total_dead; | ||
| 343 | |||
| 344 | /* | ||
| 345 | * Subtract dark space, which might or might not be usable - it depends | ||
| 346 | * on the data which we have on the media and which will be written. If | ||
| 347 | * this is a lot of uncompressed or not-compressible data, the dark | ||
| 348 | * space cannot be used. | ||
| 349 | */ | ||
| 350 | available -= c->lst.total_dark; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * However, there is more dark space. The index may be bigger than | ||
| 354 | * @min_idx_lebs. Those extra LEBs are assumed to be available, but | ||
| 355 | * their dark space is not included in total_dark, so it is subtracted | ||
| 356 | * here. | ||
| 357 | */ | ||
| 358 | if (c->lst.idx_lebs > min_idx_lebs) { | ||
| 359 | subtract_lebs = c->lst.idx_lebs - min_idx_lebs; | ||
| 360 | available -= subtract_lebs * c->dark_wm; | ||
| 361 | } | ||
| 362 | |||
| 363 | /* The calculations are rough and may end up with a negative number */ | ||
| 364 | return available > 0 ? available : 0; | ||
| 365 | } | ||
| 366 | |||
| 367 | /** | ||
| 368 | * can_use_rp - check whether the user is allowed to use reserved pool. | ||
| 369 | * @c: UBIFS file-system description object | ||
| 370 | * | ||
| 371 | * UBIFS has so-called "reserved pool" which is flash space reserved | ||
| 372 | * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock. | ||
| 373 | * This function checks whether current user is allowed to use reserved pool. | ||
| 374 | * Returns %1 current user is allowed to use reserved pool and %0 otherwise. | ||
| 375 | */ | ||
| 376 | static int can_use_rp(struct ubifs_info *c) | ||
| 377 | { | ||
| 378 | if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) || | ||
| 379 | (c->rp_gid != 0 && in_group_p(c->rp_gid))) | ||
| 380 | return 1; | ||
| 381 | return 0; | ||
| 382 | } | ||
| 383 | |||
| 384 | /** | ||
| 385 | * do_budget_space - reserve flash space for index and data growth. | ||
| 386 | * @c: UBIFS file-system description object | ||
| 387 | * | ||
| 388 | * This function makes sure UBIFS has enough free eraseblocks for index growth | ||
| 389 | * and data. | ||
| 390 | * | ||
| 391 | * When budgeting index space, UBIFS reserves twice as more LEBs as the index | ||
| 392 | * would take if it was consolidated and written to the flash. This guarantees | ||
| 393 | * that the "in-the-gaps" commit method always succeeds and UBIFS will always | ||
| 394 | * be able to commit dirty index. So this function basically adds amount of | ||
| 395 | * budgeted index space to the size of the current index, multiplies this by 2, | ||
| 396 | * and makes sure this does not exceed the amount of free eraseblocks. | ||
| 397 | * | ||
| 398 | * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: | ||
| 399 | * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might | ||
| 400 | * be large, because UBIFS does not do any index consolidation as long as | ||
| 401 | * there is free space. IOW, the index may take a lot of LEBs, but the LEBs | ||
| 402 | * will contain a lot of dirt. | ||
| 403 | * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be | ||
| 404 | * consolidated to take up to @c->min_idx_lebs LEBs. | ||
| 405 | * | ||
| 406 | * This function returns zero in case of success, and %-ENOSPC in case of | ||
| 407 | * failure. | ||
| 408 | */ | ||
| 409 | static int do_budget_space(struct ubifs_info *c) | ||
| 410 | { | ||
| 411 | long long outstanding, available; | ||
| 412 | int lebs, rsvd_idx_lebs, min_idx_lebs; | ||
| 413 | |||
| 414 | /* First budget index space */ | ||
| 415 | min_idx_lebs = ubifs_calc_min_idx_lebs(c); | ||
| 416 | |||
| 417 | /* Now 'min_idx_lebs' contains number of LEBs to reserve */ | ||
| 418 | if (min_idx_lebs > c->lst.idx_lebs) | ||
| 419 | rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; | ||
| 420 | else | ||
| 421 | rsvd_idx_lebs = 0; | ||
| 422 | |||
| 423 | /* | ||
| 424 | * The number of LEBs that are available to be used by the index is: | ||
| 425 | * | ||
| 426 | * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - | ||
| 427 | * @c->lst.taken_empty_lebs | ||
| 428 | * | ||
| 429 | * @empty_lebs are available because they are empty. @freeable_cnt are | ||
| 430 | * available because they contain only free and dirty space and the | ||
| 431 | * index allocation always occurs after wbufs are synch'ed. | ||
| 432 | * @idx_gc_cnt are available because they are index LEBs that have been | ||
| 433 | * garbage collected (including trivial GC) and are awaiting the commit | ||
| 434 | * before they can be unmapped - note that the in-the-gaps method will | ||
| 435 | * grab these if it needs them. @taken_empty_lebs are empty_lebs that | ||
| 436 | * have already been allocated for some purpose (also includes those | ||
| 437 | * LEBs on the @idx_gc list). | ||
| 438 | * | ||
| 439 | * Note, @taken_empty_lebs may temporarily be higher by one because of | ||
| 440 | * the way we serialize LEB allocations and budgeting. See a comment in | ||
| 441 | * 'ubifs_find_free_space()'. | ||
| 442 | */ | ||
| 443 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - | ||
| 444 | c->lst.taken_empty_lebs; | ||
| 445 | if (unlikely(rsvd_idx_lebs > lebs)) { | ||
| 446 | dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " | ||
| 447 | "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, | ||
| 448 | rsvd_idx_lebs); | ||
| 449 | return -ENOSPC; | ||
| 450 | } | ||
| 451 | |||
| 452 | available = ubifs_calc_available(c, min_idx_lebs); | ||
| 453 | outstanding = c->budg_data_growth + c->budg_dd_growth; | ||
| 454 | |||
| 455 | if (unlikely(available < outstanding)) { | ||
| 456 | dbg_budg("out of data space: available %lld, outstanding %lld", | ||
| 457 | available, outstanding); | ||
| 458 | return -ENOSPC; | ||
| 459 | } | ||
| 460 | |||
| 461 | if (available - outstanding <= c->rp_size && !can_use_rp(c)) | ||
| 462 | return -ENOSPC; | ||
| 463 | |||
| 464 | c->min_idx_lebs = min_idx_lebs; | ||
| 465 | return 0; | ||
| 466 | } | ||
| 467 | |||
| 468 | /** | ||
| 469 | * calc_idx_growth - calculate approximate index growth from budgeting request. | ||
| 470 | * @c: UBIFS file-system description object | ||
| 471 | * @req: budgeting request | ||
| 472 | * | ||
| 473 | * For now we assume each new node adds one znode. But this is rather poor | ||
| 474 | * approximation, though. | ||
| 475 | */ | ||
| 476 | static int calc_idx_growth(const struct ubifs_info *c, | ||
| 477 | const struct ubifs_budget_req *req) | ||
| 478 | { | ||
| 479 | int znodes; | ||
| 480 | |||
| 481 | znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + | ||
| 482 | req->new_dent; | ||
| 483 | return znodes * c->max_idx_node_sz; | ||
| 484 | } | ||
| 485 | |||
| 486 | /** | ||
| 487 | * calc_data_growth - calculate approximate amount of new data from budgeting | ||
| 488 | * request. | ||
| 489 | * @c: UBIFS file-system description object | ||
| 490 | * @req: budgeting request | ||
| 491 | */ | ||
| 492 | static int calc_data_growth(const struct ubifs_info *c, | ||
| 493 | const struct ubifs_budget_req *req) | ||
| 494 | { | ||
| 495 | int data_growth; | ||
| 496 | |||
| 497 | data_growth = req->new_ino ? c->inode_budget : 0; | ||
| 498 | if (req->new_page) | ||
| 499 | data_growth += c->page_budget; | ||
| 500 | if (req->new_dent) | ||
| 501 | data_growth += c->dent_budget; | ||
| 502 | data_growth += req->new_ino_d; | ||
| 503 | return data_growth; | ||
| 504 | } | ||
| 505 | |||
| 506 | /** | ||
| 507 | * calc_dd_growth - calculate approximate amount of data which makes other data | ||
| 508 | * dirty from budgeting request. | ||
| 509 | * @c: UBIFS file-system description object | ||
| 510 | * @req: budgeting request | ||
| 511 | */ | ||
| 512 | static int calc_dd_growth(const struct ubifs_info *c, | ||
| 513 | const struct ubifs_budget_req *req) | ||
| 514 | { | ||
| 515 | int dd_growth; | ||
| 516 | |||
| 517 | dd_growth = req->dirtied_page ? c->page_budget : 0; | ||
| 518 | |||
| 519 | if (req->dirtied_ino) | ||
| 520 | dd_growth += c->inode_budget << (req->dirtied_ino - 1); | ||
| 521 | if (req->mod_dent) | ||
| 522 | dd_growth += c->dent_budget; | ||
| 523 | dd_growth += req->dirtied_ino_d; | ||
| 524 | return dd_growth; | ||
| 525 | } | ||
| 526 | |||
| 527 | /** | ||
| 528 | * ubifs_budget_space - ensure there is enough space to complete an operation. | ||
| 529 | * @c: UBIFS file-system description object | ||
| 530 | * @req: budget request | ||
| 531 | * | ||
| 532 | * This function allocates budget for an operation. It uses pessimistic | ||
| 533 | * approximation of how much flash space the operation needs. The goal of this | ||
| 534 | * function is to make sure UBIFS always has flash space to flush all dirty | ||
| 535 | * pages, dirty inodes, and dirty znodes (liability). This function may force | ||
| 536 | * commit, garbage-collection or write-back. Returns zero in case of success, | ||
| 537 | * %-ENOSPC if there is no free space and other negative error codes in case of | ||
| 538 | * failures. | ||
| 539 | */ | ||
| 540 | int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) | ||
| 541 | { | ||
| 542 | int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); | ||
| 543 | int err, idx_growth, data_growth, dd_growth; | ||
| 544 | struct retries_info ri; | ||
| 545 | |||
| 546 | ubifs_assert(req->dirtied_ino <= 4); | ||
| 547 | ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); | ||
| 548 | |||
| 549 | data_growth = calc_data_growth(c, req); | ||
| 550 | dd_growth = calc_dd_growth(c, req); | ||
| 551 | if (!data_growth && !dd_growth) | ||
| 552 | return 0; | ||
| 553 | idx_growth = calc_idx_growth(c, req); | ||
| 554 | memset(&ri, 0, sizeof(struct retries_info)); | ||
| 555 | |||
| 556 | again: | ||
| 557 | spin_lock(&c->space_lock); | ||
| 558 | ubifs_assert(c->budg_idx_growth >= 0); | ||
| 559 | ubifs_assert(c->budg_data_growth >= 0); | ||
| 560 | ubifs_assert(c->budg_dd_growth >= 0); | ||
| 561 | |||
| 562 | if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { | ||
| 563 | dbg_budg("no space"); | ||
| 564 | spin_unlock(&c->space_lock); | ||
| 565 | return -ENOSPC; | ||
| 566 | } | ||
| 567 | |||
| 568 | c->budg_idx_growth += idx_growth; | ||
| 569 | c->budg_data_growth += data_growth; | ||
| 570 | c->budg_dd_growth += dd_growth; | ||
| 571 | |||
| 572 | err = do_budget_space(c); | ||
| 573 | if (likely(!err)) { | ||
| 574 | req->idx_growth = idx_growth; | ||
| 575 | req->data_growth = data_growth; | ||
| 576 | req->dd_growth = dd_growth; | ||
| 577 | spin_unlock(&c->space_lock); | ||
| 578 | return 0; | ||
| 579 | } | ||
| 580 | |||
| 581 | /* Restore the old values */ | ||
| 582 | c->budg_idx_growth -= idx_growth; | ||
| 583 | c->budg_data_growth -= data_growth; | ||
| 584 | c->budg_dd_growth -= dd_growth; | ||
| 585 | spin_unlock(&c->space_lock); | ||
| 586 | |||
| 587 | if (req->fast) { | ||
| 588 | dbg_budg("no space for fast budgeting"); | ||
| 589 | return err; | ||
| 590 | } | ||
| 591 | |||
| 592 | err = make_free_space(c, &ri); | ||
| 593 | if (err == -EAGAIN) { | ||
| 594 | dbg_budg("try again"); | ||
| 595 | cond_resched(); | ||
| 596 | goto again; | ||
| 597 | } else if (err == -ENOSPC) { | ||
| 598 | dbg_budg("FS is full, -ENOSPC"); | ||
| 599 | c->nospace = 1; | ||
| 600 | if (can_use_rp(c) || c->rp_size == 0) | ||
| 601 | c->nospace_rp = 1; | ||
| 602 | smp_wmb(); | ||
| 603 | } else | ||
| 604 | ubifs_err("cannot budget space, error %d", err); | ||
| 605 | return err; | ||
| 606 | } | ||
| 607 | |||
| 608 | /** | ||
| 609 | * ubifs_release_budget - release budgeted free space. | ||
| 610 | * @c: UBIFS file-system description object | ||
| 611 | * @req: budget request | ||
| 612 | * | ||
| 613 | * This function releases the space budgeted by 'ubifs_budget_space()'. Note, | ||
| 614 | * since the index changes (which were budgeted for in @req->idx_growth) will | ||
| 615 | * only be written to the media on commit, this function moves the index budget | ||
| 616 | * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be | ||
| 617 | * zeroed by the commit operation. | ||
| 618 | */ | ||
| 619 | void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | ||
| 620 | { | ||
| 621 | ubifs_assert(req->dirtied_ino <= 4); | ||
| 622 | ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); | ||
| 623 | if (!req->recalculate) { | ||
| 624 | ubifs_assert(req->idx_growth >= 0); | ||
| 625 | ubifs_assert(req->data_growth >= 0); | ||
| 626 | ubifs_assert(req->dd_growth >= 0); | ||
| 627 | } | ||
| 628 | |||
| 629 | if (req->recalculate) { | ||
| 630 | req->data_growth = calc_data_growth(c, req); | ||
| 631 | req->dd_growth = calc_dd_growth(c, req); | ||
| 632 | req->idx_growth = calc_idx_growth(c, req); | ||
| 633 | } | ||
| 634 | |||
| 635 | if (!req->data_growth && !req->dd_growth) | ||
| 636 | return; | ||
| 637 | |||
| 638 | c->nospace = c->nospace_rp = 0; | ||
| 639 | smp_wmb(); | ||
| 640 | |||
| 641 | spin_lock(&c->space_lock); | ||
| 642 | c->budg_idx_growth -= req->idx_growth; | ||
| 643 | c->budg_uncommitted_idx += req->idx_growth; | ||
| 644 | c->budg_data_growth -= req->data_growth; | ||
| 645 | c->budg_dd_growth -= req->dd_growth; | ||
| 646 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | ||
| 647 | |||
| 648 | ubifs_assert(c->budg_idx_growth >= 0); | ||
| 649 | ubifs_assert(c->budg_data_growth >= 0); | ||
| 650 | ubifs_assert(c->min_idx_lebs < c->main_lebs); | ||
| 651 | spin_unlock(&c->space_lock); | ||
| 652 | } | ||
| 653 | |||
| 654 | /** | ||
| 655 | * ubifs_convert_page_budget - convert budget of a new page. | ||
| 656 | * @c: UBIFS file-system description object | ||
| 657 | * | ||
| 658 | * This function converts budget which was allocated for a new page of data to | ||
| 659 | * the budget of changing an existing page of data. The latter is smaller then | ||
| 660 | * the former, so this function only does simple re-calculation and does not | ||
| 661 | * involve any write-back. | ||
| 662 | */ | ||
| 663 | void ubifs_convert_page_budget(struct ubifs_info *c) | ||
| 664 | { | ||
| 665 | spin_lock(&c->space_lock); | ||
| 666 | /* Release the index growth reservation */ | ||
| 667 | c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; | ||
| 668 | /* Release the data growth reservation */ | ||
| 669 | c->budg_data_growth -= c->page_budget; | ||
| 670 | /* Increase the dirty data growth reservation instead */ | ||
| 671 | c->budg_dd_growth += c->page_budget; | ||
| 672 | /* And re-calculate the indexing space reservation */ | ||
| 673 | c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); | ||
| 674 | spin_unlock(&c->space_lock); | ||
| 675 | } | ||
| 676 | |||
| 677 | /** | ||
| 678 | * ubifs_release_dirty_inode_budget - release dirty inode budget. | ||
| 679 | * @c: UBIFS file-system description object | ||
| 680 | * @ui: UBIFS inode to release the budget for | ||
| 681 | * | ||
| 682 | * This function releases budget corresponding to a dirty inode. It is usually | ||
| 683 | * called when after the inode has been written to the media and marked as | ||
| 684 | * clean. | ||
| 685 | */ | ||
| 686 | void ubifs_release_dirty_inode_budget(struct ubifs_info *c, | ||
| 687 | struct ubifs_inode *ui) | ||
| 688 | { | ||
| 689 | struct ubifs_budget_req req = {.dd_growth = c->inode_budget, | ||
| 690 | .dirtied_ino_d = ui->data_len}; | ||
| 691 | |||
| 692 | ubifs_release_budget(c, &req); | ||
| 693 | } | ||
| 694 | |||
/**
 * ubifs_budg_get_free_space - return amount of free space.
 * @c: UBIFS file-system description object
 *
 * This function returns amount of free space on the file-system. It mirrors
 * the checks in 'do_budget_space()' so that the value it reports is the
 * amount a subsequent budget request could actually obtain, minus the
 * already-outstanding (budgeted but unwritten) data.
 */
long long ubifs_budg_get_free_space(struct ubifs_info *c)
{
	int min_idx_lebs, rsvd_idx_lebs;
	long long available, outstanding, free;

	/* Do exactly the same calculations as in 'do_budget_space()' */
	spin_lock(&c->space_lock);
	min_idx_lebs = ubifs_calc_min_idx_lebs(c);

	/* LEBs that must still be reserved for the index */
	if (min_idx_lebs > c->lst.idx_lebs)
		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
	else
		rsvd_idx_lebs = 0;

	/* If the index reservation cannot be met, there is no free space */
	if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
				- c->lst.taken_empty_lebs) {
		spin_unlock(&c->space_lock);
		return 0;
	}

	available = ubifs_calc_available(c, min_idx_lebs);
	outstanding = c->budg_data_growth + c->budg_dd_growth;
	c->min_idx_lebs = min_idx_lebs;
	spin_unlock(&c->space_lock);

	/* Convert raw available space into the user-visible figure */
	if (available > outstanding)
		free = ubifs_reported_space(c, available - outstanding);
	else
		free = 0;
	return free;
}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c new file mode 100644 index 000000000000..3b516316c9b3 --- /dev/null +++ b/fs/ubifs/commit.c | |||
| @@ -0,0 +1,677 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements functions that manage the running of the commit process. | ||
| 25 | * Each affected module has its own functions to accomplish their part in the | ||
| 26 | * commit and those functions are called here. | ||
| 27 | * | ||
| 28 | * The commit is the process whereby all updates to the index and LEB properties | ||
| 29 | * are written out together and the journal becomes empty. This keeps the | ||
| 30 | * file system consistent - at all times the state can be recreated by reading | ||
| 31 | * the index and LEB properties and then replaying the journal. | ||
| 32 | * | ||
| 33 | * The commit is split into two parts named "commit start" and "commit end". | ||
| 34 | * During commit start, the commit process has exclusive access to the journal | ||
| 35 | * by holding the commit semaphore down for writing. As few I/O operations as | ||
| 36 | * possible are performed during commit start, instead the nodes that are to be | ||
| 37 | * written are merely identified. During commit end, the commit semaphore is no | ||
| 38 | * longer held and the journal is again in operation, allowing users to continue | ||
| 39 | * to use the file system while the bulk of the commit I/O is performed. The | ||
| 40 | * purpose of this two-step approach is to prevent the commit from causing any | ||
| 41 | * latency blips. Note that in any case, the commit does not prevent lookups | ||
| 42 | * (as permitted by the TNC mutex), or access to VFS data structures e.g. page | ||
| 43 | * cache. | ||
| 44 | */ | ||
| 45 | |||
| 46 | #include <linux/freezer.h> | ||
| 47 | #include <linux/kthread.h> | ||
| 48 | #include "ubifs.h" | ||
| 49 | |||
/**
 * do_commit - commit the journal.
 * @c: UBIFS file-system description object
 *
 * This function implements UBIFS commit. It has to be called with commit lock
 * locked. Returns zero in case of success and a negative error code in case of
 * failure.
 *
 * The caller must hold @c->commit_sem for writing; it is released here -
 * either after "commit start" finishes (so the journal can operate again
 * while "commit end" I/O proceeds), or on the early error paths.
 */
static int do_commit(struct ubifs_info *c)
{
	int err, new_ltail_lnum, old_ltail_lnum, i;
	struct ubifs_zbranch zroot;
	struct ubifs_lp_stats lst;

	dbg_cmt("start");
	if (c->ro_media) {
		err = -EROFS;
		goto out_up;
	}

	/* Sync all write buffers (necessary for recovery) */
	for (i = 0; i < c->jhead_cnt; i++) {
		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
		if (err)
			goto out_up;
	}

	/*
	 * "Commit start": each subsystem identifies what it has to write
	 * while we still have exclusive access to the journal.
	 */
	err = ubifs_gc_start_commit(c);
	if (err)
		goto out_up;
	err = dbg_check_lprops(c);
	if (err)
		goto out_up;
	err = ubifs_log_start_commit(c, &new_ltail_lnum);
	if (err)
		goto out_up;
	err = ubifs_tnc_start_commit(c, &zroot);
	if (err)
		goto out_up;
	err = ubifs_lpt_start_commit(c);
	if (err)
		goto out_up;
	err = ubifs_orphan_start_commit(c);
	if (err)
		goto out_up;

	ubifs_get_lp_stats(c, &lst);

	/* "Commit start" is over - let the journal run during "commit end" */
	up_write(&c->commit_sem);

	err = ubifs_tnc_end_commit(c);
	if (err)
		goto out;
	err = ubifs_lpt_end_commit(c);
	if (err)
		goto out;
	err = ubifs_orphan_end_commit(c);
	if (err)
		goto out;
	old_ltail_lnum = c->ltail_lnum;
	err = ubifs_log_end_commit(c, new_ltail_lnum);
	if (err)
		goto out;
	err = dbg_check_old_index(c, &zroot);
	if (err)
		goto out;

	/* Record the newly-committed state in the master node */
	mutex_lock(&c->mst_mutex);
	c->mst_node->cmt_no      = cpu_to_le64(++c->cmt_no);
	c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);
	c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);
	c->mst_node->root_offs   = cpu_to_le32(zroot.offs);
	c->mst_node->root_len    = cpu_to_le32(zroot.len);
	c->mst_node->ihead_lnum  = cpu_to_le32(c->ihead_lnum);
	c->mst_node->ihead_offs  = cpu_to_le32(c->ihead_offs);
	c->mst_node->index_size  = cpu_to_le64(c->old_idx_sz);
	c->mst_node->lpt_lnum    = cpu_to_le32(c->lpt_lnum);
	c->mst_node->lpt_offs    = cpu_to_le32(c->lpt_offs);
	c->mst_node->nhead_lnum  = cpu_to_le32(c->nhead_lnum);
	c->mst_node->nhead_offs  = cpu_to_le32(c->nhead_offs);
	c->mst_node->ltab_lnum   = cpu_to_le32(c->ltab_lnum);
	c->mst_node->ltab_offs   = cpu_to_le32(c->ltab_offs);
	c->mst_node->lsave_lnum  = cpu_to_le32(c->lsave_lnum);
	c->mst_node->lsave_offs  = cpu_to_le32(c->lsave_offs);
	c->mst_node->lscan_lnum  = cpu_to_le32(c->lscan_lnum);
	c->mst_node->empty_lebs  = cpu_to_le32(lst.empty_lebs);
	c->mst_node->idx_lebs    = cpu_to_le32(lst.idx_lebs);
	c->mst_node->total_free  = cpu_to_le64(lst.total_free);
	c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty);
	c->mst_node->total_used  = cpu_to_le64(lst.total_used);
	c->mst_node->total_dead  = cpu_to_le64(lst.total_dead);
	c->mst_node->total_dark  = cpu_to_le64(lst.total_dark);
	if (c->no_orphs)
		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
	else
		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
	err = ubifs_write_master(c);
	mutex_unlock(&c->mst_mutex);
	if (err)
		goto out;

	/* Post-commit cleanup: old log tail, GC and LPT housekeeping */
	err = ubifs_log_post_commit(c, old_ltail_lnum);
	if (err)
		goto out;
	err = ubifs_gc_end_commit(c);
	if (err)
		goto out;
	err = ubifs_lpt_post_commit(c);
	if (err)
		goto out;

	spin_lock(&c->cs_lock);
	c->cmt_state = COMMIT_RESTING;
	wake_up(&c->cmt_wq);
	dbg_cmt("commit end");
	spin_unlock(&c->cs_lock);

	return 0;

out_up:
	/* Early failure: commit_sem is still held, release it first */
	up_write(&c->commit_sem);
out:
	/* A failed commit is fatal: mark broken and force read-only mode */
	ubifs_err("commit failed, error %d", err);
	spin_lock(&c->cs_lock);
	c->cmt_state = COMMIT_BROKEN;
	wake_up(&c->cmt_wq);
	spin_unlock(&c->cs_lock);
	ubifs_ro_mode(c, err);
	return err;
}
| 180 | |||
/**
 * run_bg_commit - run background commit if it is needed.
 * @c: UBIFS file-system description object
 *
 * This function runs background commit if it is needed. Returns zero in case
 * of success and a negative error code in case of failure.
 *
 * The commit state is checked twice: once cheaply before taking
 * @c->commit_sem, and again after, because the state may change while the
 * semaphore is being acquired (e.g. another task may have run the commit).
 */
static int run_bg_commit(struct ubifs_info *c)
{
	spin_lock(&c->cs_lock);
	/*
	 * Run background commit only if background commit was requested or if
	 * commit is required.
	 */
	if (c->cmt_state != COMMIT_BACKGROUND &&
	    c->cmt_state != COMMIT_REQUIRED)
		goto out;
	spin_unlock(&c->cs_lock);

	down_write(&c->commit_sem);
	spin_lock(&c->cs_lock);
	/* Re-check: the state may have changed while we slept on the rwsem */
	if (c->cmt_state == COMMIT_REQUIRED)
		c->cmt_state = COMMIT_RUNNING_REQUIRED;
	else if (c->cmt_state == COMMIT_BACKGROUND)
		c->cmt_state = COMMIT_RUNNING_BACKGROUND;
	else
		goto out_cmt_unlock;
	spin_unlock(&c->cs_lock);

	/* do_commit() releases @c->commit_sem */
	return do_commit(c);

out_cmt_unlock:
	up_write(&c->commit_sem);
out:
	spin_unlock(&c->cs_lock);
	return 0;
}
| 218 | |||
/**
 * ubifs_bg_thread - UBIFS background thread function.
 * @info: points to the file-system description object
 *
 * This function implements various file-system background activities:
 * o when a write-buffer timer expires it synchronizes the appropriate
 *   write-buffer;
 * o when the journal is about to be full, it starts in-advance commit.
 *
 * Note, other stuff like background garbage collection may be added here in
 * future.
 */
int ubifs_bg_thread(void *info)
{
	int err;
	struct ubifs_info *c = info;

	ubifs_msg("background thread \"%s\" started, PID %d",
		  c->bgt_name, current->pid);
	/* Allow this kthread to be frozen during system suspend */
	set_freezable();

	while (1) {
		if (kthread_should_stop())
			break;

		if (try_to_freeze())
			continue;

		/*
		 * Set TASK_INTERRUPTIBLE *before* testing @c->need_bgt so a
		 * wakeup between the test and schedule() is not lost.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		/* Check if there is something to do */
		if (!c->need_bgt) {
			/*
			 * Nothing prevents us from going sleep now and
			 * be never woken up and block the task which
			 * could wait in 'kthread_stop()' forever.
			 */
			if (kthread_should_stop())
				break;
			schedule();
			continue;
		} else
			__set_current_state(TASK_RUNNING);

		c->need_bgt = 0;
		/* Sync timed-out write-buffers; failure forces read-only */
		err = ubifs_bg_wbufs_sync(c);
		if (err)
			ubifs_ro_mode(c, err);

		run_bg_commit(c);
		cond_resched();
	}

	dbg_msg("background thread \"%s\" stops", c->bgt_name);
	return 0;
}
| 274 | |||
/**
 * ubifs_commit_required - set commit state to "required".
 * @c: UBIFS file-system description object
 *
 * This function is called if a commit is required but cannot be done from the
 * calling function, so it is just flagged instead.
 */
void ubifs_commit_required(struct ubifs_info *c)
{
	spin_lock(&c->cs_lock);
	switch (c->cmt_state) {
	case COMMIT_RESTING:
	case COMMIT_BACKGROUND:
		/* No commit running yet - request one */
		dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
			dbg_cstate(COMMIT_REQUIRED));
		c->cmt_state = COMMIT_REQUIRED;
		break;
	case COMMIT_RUNNING_BACKGROUND:
		/* A background commit is in flight - escalate its urgency */
		dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
			dbg_cstate(COMMIT_RUNNING_REQUIRED));
		c->cmt_state = COMMIT_RUNNING_REQUIRED;
		break;
	case COMMIT_REQUIRED:
	case COMMIT_RUNNING_REQUIRED:
	case COMMIT_BROKEN:
		/* Already required, already escalated, or unrecoverable */
		break;
	}
	spin_unlock(&c->cs_lock);
}
| 304 | |||
| 305 | /** | ||
| 306 | * ubifs_request_bg_commit - notify the background thread to do a commit. | ||
| 307 | * @c: UBIFS file-system description object | ||
| 308 | * | ||
| 309 | * This function is called if the journal is full enough to make a commit | ||
| 310 | * worthwhile, so background thread is kicked to start it. | ||
| 311 | */ | ||
| 312 | void ubifs_request_bg_commit(struct ubifs_info *c) | ||
| 313 | { | ||
| 314 | spin_lock(&c->cs_lock); | ||
| 315 | if (c->cmt_state == COMMIT_RESTING) { | ||
| 316 | dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), | ||
| 317 | dbg_cstate(COMMIT_BACKGROUND)); | ||
| 318 | c->cmt_state = COMMIT_BACKGROUND; | ||
| 319 | spin_unlock(&c->cs_lock); | ||
| 320 | ubifs_wake_up_bgt(c); | ||
| 321 | } else | ||
| 322 | spin_unlock(&c->cs_lock); | ||
| 323 | } | ||
| 324 | |||
/**
 * wait_for_commit - wait for commit.
 * @c: UBIFS file-system description object
 *
 * This function sleeps until the commit operation is no longer running.
 * Returns %0 always.
 */
static int wait_for_commit(struct ubifs_info *c)
{
	dbg_cmt("pid %d goes sleep", current->pid);

	/*
	 * The following sleeps if the condition is false, and will be woken
	 * when the commit ends. It is possible, although very unlikely, that we
	 * will wake up and see the subsequent commit running, rather than the
	 * one we were waiting for, and go back to sleep. However, we will be
	 * woken again, so there is no danger of sleeping forever.
	 */
	wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND &&
			      c->cmt_state != COMMIT_RUNNING_REQUIRED);
	dbg_cmt("commit finished, pid %d woke up", current->pid);
	return 0;
}
| 347 | |||
/**
 * ubifs_run_commit - run or wait for commit.
 * @c: UBIFS file-system description object
 *
 * This function runs commit and returns zero in case of success and a negative
 * error code in case of failure. If a commit is already running, this function
 * escalates it to "required" urgency and waits for it to finish instead of
 * starting a second one.
 */
int ubifs_run_commit(struct ubifs_info *c)
{
	int err = 0;

	spin_lock(&c->cs_lock);
	if (c->cmt_state == COMMIT_BROKEN) {
		err = -EINVAL;
		goto out;
	}

	if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
		/*
		 * We set the commit state to 'running required' to indicate
		 * that we want it to complete as quickly as possible.
		 */
		c->cmt_state = COMMIT_RUNNING_REQUIRED;

	if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
		spin_unlock(&c->cs_lock);
		return wait_for_commit(c);
	}
	spin_unlock(&c->cs_lock);

	/* Ok, the commit is indeed needed */

	down_write(&c->commit_sem);
	spin_lock(&c->cs_lock);
	/*
	 * Since we unlocked 'c->cs_lock', the state may have changed, so
	 * re-check it.
	 */
	if (c->cmt_state == COMMIT_BROKEN) {
		err = -EINVAL;
		goto out_cmt_unlock;
	}

	if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
		c->cmt_state = COMMIT_RUNNING_REQUIRED;

	if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
		/* Someone else is committing - drop the rwsem and wait */
		up_write(&c->commit_sem);
		spin_unlock(&c->cs_lock);
		return wait_for_commit(c);
	}
	c->cmt_state = COMMIT_RUNNING_REQUIRED;
	spin_unlock(&c->cs_lock);

	/* do_commit() releases @c->commit_sem */
	err = do_commit(c);
	return err;

out_cmt_unlock:
	up_write(&c->commit_sem);
out:
	spin_unlock(&c->cs_lock);
	return err;
}
| 411 | |||
| 412 | /** | ||
| 413 | * ubifs_gc_should_commit - determine if it is time for GC to run commit. | ||
| 414 | * @c: UBIFS file-system description object | ||
| 415 | * | ||
| 416 | * This function is called by garbage collection to determine if commit should | ||
| 417 | * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal | ||
| 418 | * is full enough to start commit, this function returns true. It is not | ||
| 419 | * absolutely necessary to commit yet, but it feels like this should be better | ||
| 420 | * then to keep doing GC. This function returns %1 if GC has to initiate commit | ||
| 421 | * and %0 if not. | ||
| 422 | */ | ||
| 423 | int ubifs_gc_should_commit(struct ubifs_info *c) | ||
| 424 | { | ||
| 425 | int ret = 0; | ||
| 426 | |||
| 427 | spin_lock(&c->cs_lock); | ||
| 428 | if (c->cmt_state == COMMIT_BACKGROUND) { | ||
| 429 | dbg_cmt("commit required now"); | ||
| 430 | c->cmt_state = COMMIT_REQUIRED; | ||
| 431 | } else | ||
| 432 | dbg_cmt("commit not requested"); | ||
| 433 | if (c->cmt_state == COMMIT_REQUIRED) | ||
| 434 | ret = 1; | ||
| 435 | spin_unlock(&c->cs_lock); | ||
| 436 | return ret; | ||
| 437 | } | ||
| 438 | |||
| 439 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 440 | |||
/**
 * struct idx_node - hold index nodes during index tree traversal.
 * @list: list
 * @iip: index in parent (slot number of this indexing node in the parent
 *       indexing node)
 * @upper_key: all keys in this indexing node have to be less or equivalent to
 *             this key
 * @idx: index node (8-byte aligned because all node structures must be 8-byte
 *       aligned)
 *
 * Note: @idx must remain the last member - allocations over-size the struct
 * by 'ubifs_idx_node_sz(c, c->fanout) - UBIFS_IDX_NODE_SZ' bytes so the
 * branches fit after it (see 'dbg_check_old_index()').
 */
struct idx_node {
	struct list_head list;
	int iip;
	union ubifs_key upper_key;
	struct ubifs_idx_node idx __attribute__((aligned(8)));
};
| 457 | |||
| 458 | /** | ||
| 459 | * dbg_old_index_check_init - get information for the next old index check. | ||
| 460 | * @c: UBIFS file-system description object | ||
| 461 | * @zroot: root of the index | ||
| 462 | * | ||
| 463 | * This function records information about the index that will be needed for the | ||
| 464 | * next old index check i.e. 'dbg_check_old_index()'. | ||
| 465 | * | ||
| 466 | * This function returns %0 on success and a negative error code on failure. | ||
| 467 | */ | ||
| 468 | int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) | ||
| 469 | { | ||
| 470 | struct ubifs_idx_node *idx; | ||
| 471 | int lnum, offs, len, err = 0; | ||
| 472 | |||
| 473 | c->old_zroot = *zroot; | ||
| 474 | |||
| 475 | lnum = c->old_zroot.lnum; | ||
| 476 | offs = c->old_zroot.offs; | ||
| 477 | len = c->old_zroot.len; | ||
| 478 | |||
| 479 | idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); | ||
| 480 | if (!idx) | ||
| 481 | return -ENOMEM; | ||
| 482 | |||
| 483 | err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); | ||
| 484 | if (err) | ||
| 485 | goto out; | ||
| 486 | |||
| 487 | c->old_zroot_level = le16_to_cpu(idx->level); | ||
| 488 | c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); | ||
| 489 | out: | ||
| 490 | kfree(idx); | ||
| 491 | return err; | ||
| 492 | } | ||
| 493 | |||
| 494 | /** | ||
| 495 | * dbg_check_old_index - check the old copy of the index. | ||
| 496 | * @c: UBIFS file-system description object | ||
| 497 | * @zroot: root of the new index | ||
| 498 | * | ||
| 499 | * In order to be able to recover from an unclean unmount, a complete copy of | ||
| 500 | * the index must exist on flash. This is the "old" index. The commit process | ||
| 501 | * must write the "new" index to flash without overwriting or destroying any | ||
| 502 | * part of the old index. This function is run at commit end in order to check | ||
| 503 | * that the old index does indeed exist completely intact. | ||
| 504 | * | ||
| 505 | * This function returns %0 on success and a negative error code on failure. | ||
| 506 | */ | ||
int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
{
	int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
	int first = 1, iip;
	union ubifs_key lower_key, upper_key, l_key, u_key;
	unsigned long long uninitialized_var(last_sqnum);
	struct ubifs_idx_node *idx;
	struct list_head list;
	struct idx_node *i;
	size_t sz;

	/* This check is optional - skip unless explicitly enabled */
	if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
		goto out;

	INIT_LIST_HEAD(&list);

	/*
	 * Allocation size for one list element: the 'struct idx_node' wrapper
	 * already contains space for one index node header, so subtract
	 * UBIFS_IDX_NODE_SZ to avoid counting it twice.
	 */
	sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) -
	     UBIFS_IDX_NODE_SZ;

	/* Start at the old zroot */
	lnum = c->old_zroot.lnum;
	offs = c->old_zroot.offs;
	len = c->old_zroot.len;
	iip = 0;

	/*
	 * Traverse the index tree preorder depth-first i.e. do a node and then
	 * its subtrees from left to right. The list acts as an explicit stack
	 * of ancestors (root at list head, current node at list tail).
	 */
	while (1) {
		struct ubifs_branch *br;

		/* Get the next index node */
		i = kmalloc(sz, GFP_NOFS);
		if (!i) {
			err = -ENOMEM;
			goto out_free;
		}
		i->iip = iip;
		/* Keep the index nodes on our path in a linked list */
		list_add_tail(&i->list, &list);
		/* Read the index node */
		idx = &i->idx;
		err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
		if (err)
			goto out_free;
		/*
		 * Validate index node. Positive 'err' values below are small
		 * check identifiers (converted to -EINVAL on exit), which
		 * makes the failed check easy to identify in the error print.
		 */
		child_cnt = le16_to_cpu(idx->child_cnt);
		if (child_cnt < 1 || child_cnt > c->fanout) {
			err = 1;
			goto out_dump;
		}
		if (first) {
			first = 0;
			/* Check root level and sqnum */
			if (le16_to_cpu(idx->level) != c->old_zroot_level) {
				err = 2;
				goto out_dump;
			}
			if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
				err = 3;
				goto out_dump;
			}
			/* Set last values as though root had a parent */
			last_level = le16_to_cpu(idx->level) + 1;
			last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1;
			key_read(c, ubifs_idx_key(c, idx), &lower_key);
			highest_ino_key(c, &upper_key, INUM_WATERMARK);
		}
		key_copy(c, &upper_key, &i->upper_key);
		/*
		 * NOTE(review): check identifier 3 is reused here (it is also
		 * used for the root sqnum check above), so an "error 3" print
		 * is ambiguous - consider renumbering upstream.
		 */
		if (le16_to_cpu(idx->level) != last_level - 1) {
			err = 3;
			goto out_dump;
		}
		/*
		 * The index is always written bottom up hence a child's sqnum
		 * is always less than the parents.
		 */
		if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) {
			err = 4;
			goto out_dump;
		}
		/* Check key range: all keys must lie in [lower_key, upper_key] */
		key_read(c, ubifs_idx_key(c, idx), &l_key);
		br = ubifs_idx_branch(c, idx, child_cnt - 1);
		key_read(c, &br->key, &u_key);
		if (keys_cmp(c, &lower_key, &l_key) > 0) {
			err = 5;
			goto out_dump;
		}
		if (keys_cmp(c, &upper_key, &u_key) < 0) {
			err = 6;
			goto out_dump;
		}
		/* Equality at the upper bound is only legal for hashed keys */
		if (keys_cmp(c, &upper_key, &u_key) == 0)
			if (!is_hash_key(c, &u_key)) {
				err = 7;
				goto out_dump;
			}
		/* Go to next index node */
		if (le16_to_cpu(idx->level) == 0) {
			/* At the bottom, so go up until can go right */
			while (1) {
				/* Drop the bottom of the list */
				list_del(&i->list);
				kfree(i);
				/* No more list means we are done */
				if (list_empty(&list))
					goto out;
				/* Look at the new bottom */
				i = list_entry(list.prev, struct idx_node,
					       list);
				idx = &i->idx;
				/* Can we go right */
				if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
					iip = iip + 1;
					break;
				} else
					/* Nope, so go up again */
					iip = i->iip;
			}
		} else
			/* Go down left */
			iip = 0;
		/*
		 * We have the parent in 'idx' and now we set up for reading the
		 * child pointed to by slot 'iip'.
		 */
		last_level = le16_to_cpu(idx->level);
		last_sqnum = le64_to_cpu(idx->ch.sqnum);
		br = ubifs_idx_branch(c, idx, iip);
		lnum = le32_to_cpu(br->lnum);
		offs = le32_to_cpu(br->offs);
		len = le32_to_cpu(br->len);
		key_read(c, &br->key, &lower_key);
		if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
			br = ubifs_idx_branch(c, idx, iip + 1);
			key_read(c, &br->key, &upper_key);
		} else
			/* Rightmost child inherits the parent's upper bound */
			key_copy(c, &i->upper_key, &upper_key);
	}
out:
	/* Record the new zroot for the next commit's old-index check */
	err = dbg_old_index_check_init(c, zroot);
	if (err)
		goto out_free;

	return 0;

out_dump:
	dbg_err("dumping index node (iip=%d)", i->iip);
	dbg_dump_node(c, idx);
	list_del(&i->list);
	kfree(i);
	if (!list_empty(&list)) {
		i = list_entry(list.prev, struct idx_node, list);
		dbg_err("dumping parent index node");
		dbg_dump_node(c, &i->idx);
	}
out_free:
	/* Release any ancestors still on the traversal stack */
	while (!list_empty(&list)) {
		i = list_entry(list.next, struct idx_node, list);
		list_del(&i->list);
		kfree(i);
	}
	ubifs_err("failed, error %d", err);
	/* Positive values are internal check identifiers, not errnos */
	if (err > 0)
		err = -EINVAL;
	return err;
}
| 676 | |||
| 677 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c new file mode 100644 index 000000000000..5bb51dac3c16 --- /dev/null +++ b/fs/ubifs/compress.c | |||
| @@ -0,0 +1,253 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * Copyright (C) 2006, 2007 University of Szeged, Hungary | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify it | ||
| 8 | * under the terms of the GNU General Public License version 2 as published by | ||
| 9 | * the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 14 | * more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License along with | ||
| 17 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 18 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 19 | * | ||
| 20 | * Authors: Adrian Hunter | ||
| 21 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 22 | * Zoltan Sogor | ||
| 23 | */ | ||
| 24 | |||
| 25 | /* | ||
| 26 | * This file provides a single place to access to compression and | ||
| 27 | * decompression. | ||
| 28 | */ | ||
| 29 | |||
| 30 | #include <linux/crypto.h> | ||
| 31 | #include "ubifs.h" | ||
| 32 | |||
| 33 | /* Fake description object for the "none" compressor */ | ||
| 34 | static struct ubifs_compressor none_compr = { | ||
| 35 | .compr_type = UBIFS_COMPR_NONE, | ||
| 36 | .name = "no compression", | ||
| 37 | .capi_name = "", | ||
| 38 | }; | ||
| 39 | |||
| 40 | #ifdef CONFIG_UBIFS_FS_LZO | ||
| 41 | static DEFINE_MUTEX(lzo_mutex); | ||
| 42 | |||
| 43 | static struct ubifs_compressor lzo_compr = { | ||
| 44 | .compr_type = UBIFS_COMPR_LZO, | ||
| 45 | .comp_mutex = &lzo_mutex, | ||
| 46 | .name = "LZO", | ||
| 47 | .capi_name = "lzo", | ||
| 48 | }; | ||
| 49 | #else | ||
| 50 | static struct ubifs_compressor lzo_compr = { | ||
| 51 | .compr_type = UBIFS_COMPR_LZO, | ||
| 52 | .name = "LZO", | ||
| 53 | }; | ||
| 54 | #endif | ||
| 55 | |||
| 56 | #ifdef CONFIG_UBIFS_FS_ZLIB | ||
| 57 | static DEFINE_MUTEX(deflate_mutex); | ||
| 58 | static DEFINE_MUTEX(inflate_mutex); | ||
| 59 | |||
| 60 | static struct ubifs_compressor zlib_compr = { | ||
| 61 | .compr_type = UBIFS_COMPR_ZLIB, | ||
| 62 | .comp_mutex = &deflate_mutex, | ||
| 63 | .decomp_mutex = &inflate_mutex, | ||
| 64 | .name = "zlib", | ||
| 65 | .capi_name = "deflate", | ||
| 66 | }; | ||
| 67 | #else | ||
| 68 | static struct ubifs_compressor zlib_compr = { | ||
| 69 | .compr_type = UBIFS_COMPR_ZLIB, | ||
| 70 | .name = "zlib", | ||
| 71 | }; | ||
| 72 | #endif | ||
| 73 | |||
| 74 | /* All UBIFS compressors */ | ||
| 75 | struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; | ||
| 76 | |||
/**
 * ubifs_compress - compress data.
 * @in_buf: data to compress
 * @in_len: length of the data to compress
 * @out_buf: output buffer where compressed data should be stored
 * @out_len: output buffer length is returned here
 * @compr_type: type of compression to use on enter, actually used compression
 *              type on exit
 *
 * This function compresses input buffer @in_buf of length @in_len and stores
 * the result in the output buffer @out_buf and the resulting length in
 * @out_len. If the input buffer does not compress, it is just copied to the
 * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if
 * compression error occurred.
 *
 * Note, if the input buffer was not compressed, it is copied to the output
 * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
 *
 * This function never fails: on any compression problem the data is stored
 * uncompressed and @compr_type reports %UBIFS_COMPR_NONE.
 */
void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
		    int *compr_type)
{
	int err;
	struct ubifs_compressor *compr = ubifs_compressors[*compr_type];

	if (*compr_type == UBIFS_COMPR_NONE)
		goto no_compr;

	/* If the input data is small, do not even try to compress it */
	if (in_len < UBIFS_MIN_COMPR_LEN)
		goto no_compr;

	/* Serialize users of this compressor's crypto transform, if required */
	if (compr->comp_mutex)
		mutex_lock(compr->comp_mutex);
	err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
				   out_len);
	if (compr->comp_mutex)
		mutex_unlock(compr->comp_mutex);
	if (unlikely(err)) {
		ubifs_warn("cannot compress %d bytes, compressor %s, "
			   "error %d, leave data uncompressed",
			   in_len, compr->name, err);
		goto no_compr;
	}

	/*
	 * Presently, we just require that compression results in less data,
	 * rather than any defined minimum compression ratio or amount.
	 */
	if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
		goto no_compr;

	return;

no_compr:
	memcpy(out_buf, in_buf, in_len);
	*out_len = in_len;
	*compr_type = UBIFS_COMPR_NONE;
}
| 137 | |||
/**
 * ubifs_decompress - decompress data.
 * @in_buf: data to decompress
 * @in_len: length of the data to decompress
 * @out_buf: output buffer where decompressed data should be stored
 * @out_len: output length is returned here
 * @compr_type: type of compression
 *
 * This function decompresses data from buffer @in_buf into buffer @out_buf.
 * The length of the uncompressed data is returned in @out_len. This function
 * returns %0 on success or a negative error code on failure.
 */
int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
		     int *out_len, int compr_type)
{
	int err;
	struct ubifs_compressor *compr;

	/* @compr_type comes from on-flash data, so range-check it */
	if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
		ubifs_err("invalid compression type %d", compr_type);
		return -EINVAL;
	}

	compr = ubifs_compressors[compr_type];

	/*
	 * Only compiled-out compressors have a NULL capi_name; the "none"
	 * compressor uses an empty (non-NULL) string and passes this check.
	 */
	if (unlikely(!compr->capi_name)) {
		ubifs_err("%s compression is not compiled in", compr->name);
		return -EINVAL;
	}

	if (compr_type == UBIFS_COMPR_NONE) {
		memcpy(out_buf, in_buf, in_len);
		*out_len = in_len;
		return 0;
	}

	/* Serialize users of this compressor's crypto transform, if required */
	if (compr->decomp_mutex)
		mutex_lock(compr->decomp_mutex);
	err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
				     out_len);
	if (compr->decomp_mutex)
		mutex_unlock(compr->decomp_mutex);
	if (err)
		ubifs_err("cannot decompress %d bytes, compressor %s, "
			  "error %d", in_len, compr->name, err);

	return err;
}
| 186 | |||
| 187 | /** | ||
| 188 | * compr_init - initialize a compressor. | ||
| 189 | * @compr: compressor description object | ||
| 190 | * | ||
| 191 | * This function initializes the requested compressor and returns zero in case | ||
| 192 | * of success or a negative error code in case of failure. | ||
| 193 | */ | ||
| 194 | static int __init compr_init(struct ubifs_compressor *compr) | ||
| 195 | { | ||
| 196 | if (compr->capi_name) { | ||
| 197 | compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); | ||
| 198 | if (IS_ERR(compr->cc)) { | ||
| 199 | ubifs_err("cannot initialize compressor %s, error %ld", | ||
| 200 | compr->name, PTR_ERR(compr->cc)); | ||
| 201 | return PTR_ERR(compr->cc); | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | ubifs_compressors[compr->compr_type] = compr; | ||
| 206 | return 0; | ||
| 207 | } | ||
| 208 | |||
| 209 | /** | ||
| 210 | * compr_exit - de-initialize a compressor. | ||
| 211 | * @compr: compressor description object | ||
| 212 | */ | ||
| 213 | static void compr_exit(struct ubifs_compressor *compr) | ||
| 214 | { | ||
| 215 | if (compr->capi_name) | ||
| 216 | crypto_free_comp(compr->cc); | ||
| 217 | return; | ||
| 218 | } | ||
| 219 | |||
| 220 | /** | ||
| 221 | * ubifs_compressors_init - initialize UBIFS compressors. | ||
| 222 | * | ||
| 223 | * This function initializes the compressor which were compiled in. Returns | ||
| 224 | * zero in case of success and a negative error code in case of failure. | ||
| 225 | */ | ||
| 226 | int __init ubifs_compressors_init(void) | ||
| 227 | { | ||
| 228 | int err; | ||
| 229 | |||
| 230 | err = compr_init(&lzo_compr); | ||
| 231 | if (err) | ||
| 232 | return err; | ||
| 233 | |||
| 234 | err = compr_init(&zlib_compr); | ||
| 235 | if (err) | ||
| 236 | goto out_lzo; | ||
| 237 | |||
| 238 | ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; | ||
| 239 | return 0; | ||
| 240 | |||
| 241 | out_lzo: | ||
| 242 | compr_exit(&lzo_compr); | ||
| 243 | return err; | ||
| 244 | } | ||
| 245 | |||
| 246 | /** | ||
| 247 | * ubifs_compressors_exit - de-initialize UBIFS compressors. | ||
| 248 | */ | ||
| 249 | void __exit ubifs_compressors_exit(void) | ||
| 250 | { | ||
| 251 | compr_exit(&lzo_compr); | ||
| 252 | compr_exit(&zlib_compr); | ||
| 253 | } | ||
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c new file mode 100644 index 000000000000..4e3aaeba4eca --- /dev/null +++ b/fs/ubifs/debug.c | |||
| @@ -0,0 +1,2289 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements most of the debugging stuff which is compiled in only | ||
| 25 | * when it is enabled. But some debugging check functions are implemented in | ||
| 26 | * corresponding subsystem, just because they are closely related and utilize | ||
| 27 | * various local functions of those subsystems. | ||
| 28 | */ | ||
| 29 | |||
| 30 | #define UBIFS_DBG_PRESERVE_UBI | ||
| 31 | |||
| 32 | #include "ubifs.h" | ||
| 33 | #include <linux/module.h> | ||
| 34 | #include <linux/moduleparam.h> | ||
| 35 | |||
| 36 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 37 | |||
| 38 | DEFINE_SPINLOCK(dbg_lock); | ||
| 39 | |||
| 40 | static char dbg_key_buf0[128]; | ||
| 41 | static char dbg_key_buf1[128]; | ||
| 42 | |||
| 43 | unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; | ||
| 44 | unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; | ||
| 45 | unsigned int ubifs_tst_flags; | ||
| 46 | |||
| 47 | module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); | ||
| 48 | module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); | ||
| 49 | module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); | ||
| 50 | |||
| 51 | MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); | ||
| 52 | MODULE_PARM_DESC(debug_chks, "Debug check flags"); | ||
| 53 | MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); | ||
| 54 | |||
| 55 | static const char *get_key_fmt(int fmt) | ||
| 56 | { | ||
| 57 | switch (fmt) { | ||
| 58 | case UBIFS_SIMPLE_KEY_FMT: | ||
| 59 | return "simple"; | ||
| 60 | default: | ||
| 61 | return "unknown/invalid format"; | ||
| 62 | } | ||
| 63 | } | ||
| 64 | |||
| 65 | static const char *get_key_hash(int hash) | ||
| 66 | { | ||
| 67 | switch (hash) { | ||
| 68 | case UBIFS_KEY_HASH_R5: | ||
| 69 | return "R5"; | ||
| 70 | case UBIFS_KEY_HASH_TEST: | ||
| 71 | return "test"; | ||
| 72 | default: | ||
| 73 | return "unknown/invalid name hash"; | ||
| 74 | } | ||
| 75 | } | ||
| 76 | |||
| 77 | static const char *get_key_type(int type) | ||
| 78 | { | ||
| 79 | switch (type) { | ||
| 80 | case UBIFS_INO_KEY: | ||
| 81 | return "inode"; | ||
| 82 | case UBIFS_DENT_KEY: | ||
| 83 | return "direntry"; | ||
| 84 | case UBIFS_XENT_KEY: | ||
| 85 | return "xentry"; | ||
| 86 | case UBIFS_DATA_KEY: | ||
| 87 | return "data"; | ||
| 88 | case UBIFS_TRUN_KEY: | ||
| 89 | return "truncate"; | ||
| 90 | default: | ||
| 91 | return "unknown/invalid key"; | ||
| 92 | } | ||
| 93 | } | ||
| 94 | |||
/*
 * sprintf_key - format a key into @buffer as human-readable text.
 *
 * Note: this writes with plain sprintf(), so @buffer must be large enough
 * for the longest formatted key; the callers use the 128-byte static
 * 'dbg_key_buf0'/'dbg_key_buf1' buffers defined above.
 */
static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
			char *buffer)
{
	char *p = buffer;
	int type = key_type(c, key);

	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
		switch (type) {
		case UBIFS_INO_KEY:
			sprintf(p, "(%lu, %s)", key_inum(c, key),
			       get_key_type(type));
			break;
		case UBIFS_DENT_KEY:
		case UBIFS_XENT_KEY:
			sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key),
				get_key_type(type), key_hash(c, key));
			break;
		case UBIFS_DATA_KEY:
			sprintf(p, "(%lu, %s, %u)", key_inum(c, key),
				get_key_type(type), key_block(c, key));
			break;
		case UBIFS_TRUN_KEY:
			sprintf(p, "(%lu, %s)",
				key_inum(c, key), get_key_type(type));
			break;
		default:
			/* Dump the raw words so a corrupted key is still visible */
			sprintf(p, "(bad key type: %#08x, %#08x)",
				key->u32[0], key->u32[1]);
		}
	} else
		sprintf(p, "bad key format %d", c->key_fmt);
}
| 127 | |||
/*
 * Format @key into the first shared static buffer and return it. The result
 * is only valid while 'dbg_lock' is held, since the buffer is shared.
 */
const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
{
	/* dbg_lock must be held */
	sprintf_key(c, key, dbg_key_buf0);
	return dbg_key_buf0;
}
| 134 | |||
/*
 * Format @key into the second shared static buffer and return it. A second
 * buffer exists so that two keys may appear in one printk. The result is
 * only valid while 'dbg_lock' is held, since the buffer is shared.
 */
const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
{
	/* dbg_lock must be held */
	sprintf_key(c, key, dbg_key_buf1);
	return dbg_key_buf1;
}
| 141 | |||
| 142 | const char *dbg_ntype(int type) | ||
| 143 | { | ||
| 144 | switch (type) { | ||
| 145 | case UBIFS_PAD_NODE: | ||
| 146 | return "padding node"; | ||
| 147 | case UBIFS_SB_NODE: | ||
| 148 | return "superblock node"; | ||
| 149 | case UBIFS_MST_NODE: | ||
| 150 | return "master node"; | ||
| 151 | case UBIFS_REF_NODE: | ||
| 152 | return "reference node"; | ||
| 153 | case UBIFS_INO_NODE: | ||
| 154 | return "inode node"; | ||
| 155 | case UBIFS_DENT_NODE: | ||
| 156 | return "direntry node"; | ||
| 157 | case UBIFS_XENT_NODE: | ||
| 158 | return "xentry node"; | ||
| 159 | case UBIFS_DATA_NODE: | ||
| 160 | return "data node"; | ||
| 161 | case UBIFS_TRUN_NODE: | ||
| 162 | return "truncate node"; | ||
| 163 | case UBIFS_IDX_NODE: | ||
| 164 | return "indexing node"; | ||
| 165 | case UBIFS_CS_NODE: | ||
| 166 | return "commit start node"; | ||
| 167 | case UBIFS_ORPH_NODE: | ||
| 168 | return "orphan node"; | ||
| 169 | default: | ||
| 170 | return "unknown node"; | ||
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | static const char *dbg_gtype(int type) | ||
| 175 | { | ||
| 176 | switch (type) { | ||
| 177 | case UBIFS_NO_NODE_GROUP: | ||
| 178 | return "no node group"; | ||
| 179 | case UBIFS_IN_NODE_GROUP: | ||
| 180 | return "in node group"; | ||
| 181 | case UBIFS_LAST_OF_NODE_GROUP: | ||
| 182 | return "last of node group"; | ||
| 183 | default: | ||
| 184 | return "unknown"; | ||
| 185 | } | ||
| 186 | } | ||
| 187 | |||
| 188 | const char *dbg_cstate(int cmt_state) | ||
| 189 | { | ||
| 190 | switch (cmt_state) { | ||
| 191 | case COMMIT_RESTING: | ||
| 192 | return "commit resting"; | ||
| 193 | case COMMIT_BACKGROUND: | ||
| 194 | return "background commit requested"; | ||
| 195 | case COMMIT_REQUIRED: | ||
| 196 | return "commit required"; | ||
| 197 | case COMMIT_RUNNING_BACKGROUND: | ||
| 198 | return "BACKGROUND commit running"; | ||
| 199 | case COMMIT_RUNNING_REQUIRED: | ||
| 200 | return "commit running and required"; | ||
| 201 | case COMMIT_BROKEN: | ||
| 202 | return "broken commit"; | ||
| 203 | default: | ||
| 204 | return "unknown commit state"; | ||
| 205 | } | ||
| 206 | } | ||
| 207 | |||
/* Dump the fields of a common node header ('struct ubifs_ch') */
static void dump_ch(const struct ubifs_ch *ch)
{
	printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
	printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc));
	printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type,
	       dbg_ntype(ch->node_type));
	printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type,
	       dbg_gtype(ch->group_type));
	printk(KERN_DEBUG "\tsqnum %llu\n",
	       (unsigned long long)le64_to_cpu(ch->sqnum));
	printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
}
| 220 | |||
/*
 * dbg_dump_inode - dump the VFS and UBIFS-specific fields of an inode.
 * @c: UBIFS file-system description object (currently unused here; kept for
 *     symmetry with the other dump helpers)
 * @inode: inode to dump
 */
void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
{
	const struct ubifs_inode *ui = ubifs_inode(inode);

	printk(KERN_DEBUG "inode      %lu\n", inode->i_ino);
	printk(KERN_DEBUG "size       %llu\n",
	       (unsigned long long)i_size_read(inode));
	printk(KERN_DEBUG "nlink      %u\n", inode->i_nlink);
	printk(KERN_DEBUG "uid        %u\n", (unsigned int)inode->i_uid);
	printk(KERN_DEBUG "gid        %u\n", (unsigned int)inode->i_gid);
	printk(KERN_DEBUG "atime      %u.%u\n",
	       (unsigned int)inode->i_atime.tv_sec,
	       (unsigned int)inode->i_atime.tv_nsec);
	printk(KERN_DEBUG "mtime      %u.%u\n",
	       (unsigned int)inode->i_mtime.tv_sec,
	       (unsigned int)inode->i_mtime.tv_nsec);
	printk(KERN_DEBUG "ctime       %u.%u\n",
	       (unsigned int)inode->i_ctime.tv_sec,
	       (unsigned int)inode->i_ctime.tv_nsec);
	printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
	printk(KERN_DEBUG "xattr_size  %u\n", ui->xattr_size);
	printk(KERN_DEBUG "xattr_cnt   %u\n", ui->xattr_cnt);
	printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
	printk(KERN_DEBUG "dirty       %u\n", ui->dirty);
	printk(KERN_DEBUG "xattr       %u\n", ui->xattr);
	printk(KERN_DEBUG "flags       %d\n", ui->flags);
	printk(KERN_DEBUG "compr_type  %d\n", ui->compr_type);
	printk(KERN_DEBUG "data_len    %d\n", ui->data_len);
}
| 250 | |||
| 251 | void dbg_dump_node(const struct ubifs_info *c, const void *node) | ||
| 252 | { | ||
| 253 | int i, n; | ||
| 254 | union ubifs_key key; | ||
| 255 | const struct ubifs_ch *ch = node; | ||
| 256 | |||
| 257 | if (dbg_failure_mode) | ||
| 258 | return; | ||
| 259 | |||
| 260 | /* If the magic is incorrect, just hexdump the first bytes */ | ||
| 261 | if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { | ||
| 262 | printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); | ||
| 263 | print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, | ||
| 264 | (void *)node, UBIFS_CH_SZ, 1); | ||
| 265 | return; | ||
| 266 | } | ||
| 267 | |||
| 268 | spin_lock(&dbg_lock); | ||
| 269 | dump_ch(node); | ||
| 270 | |||
| 271 | switch (ch->node_type) { | ||
| 272 | case UBIFS_PAD_NODE: | ||
| 273 | { | ||
| 274 | const struct ubifs_pad_node *pad = node; | ||
| 275 | |||
| 276 | printk(KERN_DEBUG "\tpad_len %u\n", | ||
| 277 | le32_to_cpu(pad->pad_len)); | ||
| 278 | break; | ||
| 279 | } | ||
| 280 | case UBIFS_SB_NODE: | ||
| 281 | { | ||
| 282 | const struct ubifs_sb_node *sup = node; | ||
| 283 | unsigned int sup_flags = le32_to_cpu(sup->flags); | ||
| 284 | |||
| 285 | printk(KERN_DEBUG "\tkey_hash %d (%s)\n", | ||
| 286 | (int)sup->key_hash, get_key_hash(sup->key_hash)); | ||
| 287 | printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", | ||
| 288 | (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); | ||
| 289 | printk(KERN_DEBUG "\tflags %#x\n", sup_flags); | ||
| 290 | printk(KERN_DEBUG "\t big_lpt %u\n", | ||
| 291 | !!(sup_flags & UBIFS_FLG_BIGLPT)); | ||
| 292 | printk(KERN_DEBUG "\tmin_io_size %u\n", | ||
| 293 | le32_to_cpu(sup->min_io_size)); | ||
| 294 | printk(KERN_DEBUG "\tleb_size %u\n", | ||
| 295 | le32_to_cpu(sup->leb_size)); | ||
| 296 | printk(KERN_DEBUG "\tleb_cnt %u\n", | ||
| 297 | le32_to_cpu(sup->leb_cnt)); | ||
| 298 | printk(KERN_DEBUG "\tmax_leb_cnt %u\n", | ||
| 299 | le32_to_cpu(sup->max_leb_cnt)); | ||
| 300 | printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", | ||
| 301 | (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); | ||
| 302 | printk(KERN_DEBUG "\tlog_lebs %u\n", | ||
| 303 | le32_to_cpu(sup->log_lebs)); | ||
| 304 | printk(KERN_DEBUG "\tlpt_lebs %u\n", | ||
| 305 | le32_to_cpu(sup->lpt_lebs)); | ||
| 306 | printk(KERN_DEBUG "\torph_lebs %u\n", | ||
| 307 | le32_to_cpu(sup->orph_lebs)); | ||
| 308 | printk(KERN_DEBUG "\tjhead_cnt %u\n", | ||
| 309 | le32_to_cpu(sup->jhead_cnt)); | ||
| 310 | printk(KERN_DEBUG "\tfanout %u\n", | ||
| 311 | le32_to_cpu(sup->fanout)); | ||
| 312 | printk(KERN_DEBUG "\tlsave_cnt %u\n", | ||
| 313 | le32_to_cpu(sup->lsave_cnt)); | ||
| 314 | printk(KERN_DEBUG "\tdefault_compr %u\n", | ||
| 315 | (int)le16_to_cpu(sup->default_compr)); | ||
| 316 | printk(KERN_DEBUG "\trp_size %llu\n", | ||
| 317 | (unsigned long long)le64_to_cpu(sup->rp_size)); | ||
| 318 | printk(KERN_DEBUG "\trp_uid %u\n", | ||
| 319 | le32_to_cpu(sup->rp_uid)); | ||
| 320 | printk(KERN_DEBUG "\trp_gid %u\n", | ||
| 321 | le32_to_cpu(sup->rp_gid)); | ||
| 322 | printk(KERN_DEBUG "\tfmt_version %u\n", | ||
| 323 | le32_to_cpu(sup->fmt_version)); | ||
| 324 | printk(KERN_DEBUG "\ttime_gran %u\n", | ||
| 325 | le32_to_cpu(sup->time_gran)); | ||
| 326 | printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" | ||
| 327 | "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", | ||
| 328 | sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], | ||
| 329 | sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], | ||
| 330 | sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], | ||
| 331 | sup->uuid[12], sup->uuid[13], sup->uuid[14], | ||
| 332 | sup->uuid[15]); | ||
| 333 | break; | ||
| 334 | } | ||
| 335 | case UBIFS_MST_NODE: | ||
| 336 | { | ||
| 337 | const struct ubifs_mst_node *mst = node; | ||
| 338 | |||
| 339 | printk(KERN_DEBUG "\thighest_inum %llu\n", | ||
| 340 | (unsigned long long)le64_to_cpu(mst->highest_inum)); | ||
| 341 | printk(KERN_DEBUG "\tcommit number %llu\n", | ||
| 342 | (unsigned long long)le64_to_cpu(mst->cmt_no)); | ||
| 343 | printk(KERN_DEBUG "\tflags %#x\n", | ||
| 344 | le32_to_cpu(mst->flags)); | ||
| 345 | printk(KERN_DEBUG "\tlog_lnum %u\n", | ||
| 346 | le32_to_cpu(mst->log_lnum)); | ||
| 347 | printk(KERN_DEBUG "\troot_lnum %u\n", | ||
| 348 | le32_to_cpu(mst->root_lnum)); | ||
| 349 | printk(KERN_DEBUG "\troot_offs %u\n", | ||
| 350 | le32_to_cpu(mst->root_offs)); | ||
| 351 | printk(KERN_DEBUG "\troot_len %u\n", | ||
| 352 | le32_to_cpu(mst->root_len)); | ||
| 353 | printk(KERN_DEBUG "\tgc_lnum %u\n", | ||
| 354 | le32_to_cpu(mst->gc_lnum)); | ||
| 355 | printk(KERN_DEBUG "\tihead_lnum %u\n", | ||
| 356 | le32_to_cpu(mst->ihead_lnum)); | ||
| 357 | printk(KERN_DEBUG "\tihead_offs %u\n", | ||
| 358 | le32_to_cpu(mst->ihead_offs)); | ||
| 359 | printk(KERN_DEBUG "\tindex_size %u\n", | ||
| 360 | le32_to_cpu(mst->index_size)); | ||
| 361 | printk(KERN_DEBUG "\tlpt_lnum %u\n", | ||
| 362 | le32_to_cpu(mst->lpt_lnum)); | ||
| 363 | printk(KERN_DEBUG "\tlpt_offs %u\n", | ||
| 364 | le32_to_cpu(mst->lpt_offs)); | ||
| 365 | printk(KERN_DEBUG "\tnhead_lnum %u\n", | ||
| 366 | le32_to_cpu(mst->nhead_lnum)); | ||
| 367 | printk(KERN_DEBUG "\tnhead_offs %u\n", | ||
| 368 | le32_to_cpu(mst->nhead_offs)); | ||
| 369 | printk(KERN_DEBUG "\tltab_lnum %u\n", | ||
| 370 | le32_to_cpu(mst->ltab_lnum)); | ||
| 371 | printk(KERN_DEBUG "\tltab_offs %u\n", | ||
| 372 | le32_to_cpu(mst->ltab_offs)); | ||
| 373 | printk(KERN_DEBUG "\tlsave_lnum %u\n", | ||
| 374 | le32_to_cpu(mst->lsave_lnum)); | ||
| 375 | printk(KERN_DEBUG "\tlsave_offs %u\n", | ||
| 376 | le32_to_cpu(mst->lsave_offs)); | ||
| 377 | printk(KERN_DEBUG "\tlscan_lnum %u\n", | ||
| 378 | le32_to_cpu(mst->lscan_lnum)); | ||
| 379 | printk(KERN_DEBUG "\tleb_cnt %u\n", | ||
| 380 | le32_to_cpu(mst->leb_cnt)); | ||
| 381 | printk(KERN_DEBUG "\tempty_lebs %u\n", | ||
| 382 | le32_to_cpu(mst->empty_lebs)); | ||
| 383 | printk(KERN_DEBUG "\tidx_lebs %u\n", | ||
| 384 | le32_to_cpu(mst->idx_lebs)); | ||
| 385 | printk(KERN_DEBUG "\ttotal_free %llu\n", | ||
| 386 | (unsigned long long)le64_to_cpu(mst->total_free)); | ||
| 387 | printk(KERN_DEBUG "\ttotal_dirty %llu\n", | ||
| 388 | (unsigned long long)le64_to_cpu(mst->total_dirty)); | ||
| 389 | printk(KERN_DEBUG "\ttotal_used %llu\n", | ||
| 390 | (unsigned long long)le64_to_cpu(mst->total_used)); | ||
| 391 | printk(KERN_DEBUG "\ttotal_dead %llu\n", | ||
| 392 | (unsigned long long)le64_to_cpu(mst->total_dead)); | ||
| 393 | printk(KERN_DEBUG "\ttotal_dark %llu\n", | ||
| 394 | (unsigned long long)le64_to_cpu(mst->total_dark)); | ||
| 395 | break; | ||
| 396 | } | ||
| 397 | case UBIFS_REF_NODE: | ||
| 398 | { | ||
| 399 | const struct ubifs_ref_node *ref = node; | ||
| 400 | |||
| 401 | printk(KERN_DEBUG "\tlnum %u\n", | ||
| 402 | le32_to_cpu(ref->lnum)); | ||
| 403 | printk(KERN_DEBUG "\toffs %u\n", | ||
| 404 | le32_to_cpu(ref->offs)); | ||
| 405 | printk(KERN_DEBUG "\tjhead %u\n", | ||
| 406 | le32_to_cpu(ref->jhead)); | ||
| 407 | break; | ||
| 408 | } | ||
| 409 | case UBIFS_INO_NODE: | ||
| 410 | { | ||
| 411 | const struct ubifs_ino_node *ino = node; | ||
| 412 | |||
| 413 | key_read(c, &ino->key, &key); | ||
| 414 | printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); | ||
| 415 | printk(KERN_DEBUG "\tcreat_sqnum %llu\n", | ||
| 416 | (unsigned long long)le64_to_cpu(ino->creat_sqnum)); | ||
| 417 | printk(KERN_DEBUG "\tsize %llu\n", | ||
| 418 | (unsigned long long)le64_to_cpu(ino->size)); | ||
| 419 | printk(KERN_DEBUG "\tnlink %u\n", | ||
| 420 | le32_to_cpu(ino->nlink)); | ||
| 421 | printk(KERN_DEBUG "\tatime %lld.%u\n", | ||
| 422 | (long long)le64_to_cpu(ino->atime_sec), | ||
| 423 | le32_to_cpu(ino->atime_nsec)); | ||
| 424 | printk(KERN_DEBUG "\tmtime %lld.%u\n", | ||
| 425 | (long long)le64_to_cpu(ino->mtime_sec), | ||
| 426 | le32_to_cpu(ino->mtime_nsec)); | ||
| 427 | printk(KERN_DEBUG "\tctime %lld.%u\n", | ||
| 428 | (long long)le64_to_cpu(ino->ctime_sec), | ||
| 429 | le32_to_cpu(ino->ctime_nsec)); | ||
| 430 | printk(KERN_DEBUG "\tuid %u\n", | ||
| 431 | le32_to_cpu(ino->uid)); | ||
| 432 | printk(KERN_DEBUG "\tgid %u\n", | ||
| 433 | le32_to_cpu(ino->gid)); | ||
| 434 | printk(KERN_DEBUG "\tmode %u\n", | ||
| 435 | le32_to_cpu(ino->mode)); | ||
| 436 | printk(KERN_DEBUG "\tflags %#x\n", | ||
| 437 | le32_to_cpu(ino->flags)); | ||
| 438 | printk(KERN_DEBUG "\txattr_cnt %u\n", | ||
| 439 | le32_to_cpu(ino->xattr_cnt)); | ||
| 440 | printk(KERN_DEBUG "\txattr_size %u\n", | ||
| 441 | le32_to_cpu(ino->xattr_size)); | ||
| 442 | printk(KERN_DEBUG "\txattr_names %u\n", | ||
| 443 | le32_to_cpu(ino->xattr_names)); | ||
| 444 | printk(KERN_DEBUG "\tcompr_type %#x\n", | ||
| 445 | (int)le16_to_cpu(ino->compr_type)); | ||
| 446 | printk(KERN_DEBUG "\tdata len %u\n", | ||
| 447 | le32_to_cpu(ino->data_len)); | ||
| 448 | break; | ||
| 449 | } | ||
| 450 | case UBIFS_DENT_NODE: | ||
| 451 | case UBIFS_XENT_NODE: | ||
| 452 | { | ||
| 453 | const struct ubifs_dent_node *dent = node; | ||
| 454 | int nlen = le16_to_cpu(dent->nlen); | ||
| 455 | |||
| 456 | key_read(c, &dent->key, &key); | ||
| 457 | printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); | ||
| 458 | printk(KERN_DEBUG "\tinum %llu\n", | ||
| 459 | (unsigned long long)le64_to_cpu(dent->inum)); | ||
| 460 | printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); | ||
| 461 | printk(KERN_DEBUG "\tnlen %d\n", nlen); | ||
| 462 | printk(KERN_DEBUG "\tname "); | ||
| 463 | |||
| 464 | if (nlen > UBIFS_MAX_NLEN) | ||
| 465 | printk(KERN_DEBUG "(bad name length, not printing, " | ||
| 466 | "bad or corrupted node)"); | ||
| 467 | else { | ||
| 468 | for (i = 0; i < nlen && dent->name[i]; i++) | ||
| 469 | printk("%c", dent->name[i]); | ||
| 470 | } | ||
| 471 | printk("\n"); | ||
| 472 | |||
| 473 | break; | ||
| 474 | } | ||
| 475 | case UBIFS_DATA_NODE: | ||
| 476 | { | ||
| 477 | const struct ubifs_data_node *dn = node; | ||
| 478 | int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; | ||
| 479 | |||
| 480 | key_read(c, &dn->key, &key); | ||
| 481 | printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); | ||
| 482 | printk(KERN_DEBUG "\tsize %u\n", | ||
| 483 | le32_to_cpu(dn->size)); | ||
| 484 | printk(KERN_DEBUG "\tcompr_typ %d\n", | ||
| 485 | (int)le16_to_cpu(dn->compr_type)); | ||
| 486 | printk(KERN_DEBUG "\tdata size %d\n", | ||
| 487 | dlen); | ||
| 488 | printk(KERN_DEBUG "\tdata:\n"); | ||
| 489 | print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, | ||
| 490 | (void *)&dn->data, dlen, 0); | ||
| 491 | break; | ||
| 492 | } | ||
| 493 | case UBIFS_TRUN_NODE: | ||
| 494 | { | ||
| 495 | const struct ubifs_trun_node *trun = node; | ||
| 496 | |||
| 497 | printk(KERN_DEBUG "\tinum %u\n", | ||
| 498 | le32_to_cpu(trun->inum)); | ||
| 499 | printk(KERN_DEBUG "\told_size %llu\n", | ||
| 500 | (unsigned long long)le64_to_cpu(trun->old_size)); | ||
| 501 | printk(KERN_DEBUG "\tnew_size %llu\n", | ||
| 502 | (unsigned long long)le64_to_cpu(trun->new_size)); | ||
| 503 | break; | ||
| 504 | } | ||
| 505 | case UBIFS_IDX_NODE: | ||
| 506 | { | ||
| 507 | const struct ubifs_idx_node *idx = node; | ||
| 508 | |||
| 509 | n = le16_to_cpu(idx->child_cnt); | ||
| 510 | printk(KERN_DEBUG "\tchild_cnt %d\n", n); | ||
| 511 | printk(KERN_DEBUG "\tlevel %d\n", | ||
| 512 | (int)le16_to_cpu(idx->level)); | ||
| 513 | printk(KERN_DEBUG "\tBranches:\n"); | ||
| 514 | |||
| 515 | for (i = 0; i < n && i < c->fanout - 1; i++) { | ||
| 516 | const struct ubifs_branch *br; | ||
| 517 | |||
| 518 | br = ubifs_idx_branch(c, idx, i); | ||
| 519 | key_read(c, &br->key, &key); | ||
| 520 | printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", | ||
| 521 | i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), | ||
| 522 | le32_to_cpu(br->len), DBGKEY(&key)); | ||
| 523 | } | ||
| 524 | break; | ||
| 525 | } | ||
| 526 | case UBIFS_CS_NODE: | ||
| 527 | break; | ||
| 528 | case UBIFS_ORPH_NODE: | ||
| 529 | { | ||
| 530 | const struct ubifs_orph_node *orph = node; | ||
| 531 | |||
| 532 | printk(KERN_DEBUG "\tcommit number %llu\n", | ||
| 533 | (unsigned long long) | ||
| 534 | le64_to_cpu(orph->cmt_no) & LLONG_MAX); | ||
| 535 | printk(KERN_DEBUG "\tlast node flag %llu\n", | ||
| 536 | (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); | ||
| 537 | n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; | ||
| 538 | printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); | ||
| 539 | for (i = 0; i < n; i++) | ||
| 540 | printk(KERN_DEBUG "\t ino %llu\n", | ||
| 541 | le64_to_cpu(orph->inos[i])); | ||
| 542 | break; | ||
| 543 | } | ||
| 544 | default: | ||
| 545 | printk(KERN_DEBUG "node type %d was not recognized\n", | ||
| 546 | (int)ch->node_type); | ||
| 547 | } | ||
| 548 | spin_unlock(&dbg_lock); | ||
| 549 | } | ||
| 550 | |||
/**
 * dbg_dump_budget_req - dump a budgeting request.
 * @req: the budgeting request to dump
 *
 * Prints every field of @req at KERN_DEBUG level. The whole multi-line dump
 * is done under @dbg_lock so it is not interleaved with other debug output.
 */
void dbg_dump_budget_req(const struct ubifs_budget_req *req)
{
	spin_lock(&dbg_lock);
	printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
	       req->new_ino, req->dirtied_ino);
	printk(KERN_DEBUG "\tnew_ino_d   %d, dirtied_ino_d %d\n",
	       req->new_ino_d, req->dirtied_ino_d);
	printk(KERN_DEBUG "\tnew_page    %d, dirtied_page %d\n",
	       req->new_page, req->dirtied_page);
	printk(KERN_DEBUG "\tnew_dent    %d, mod_dent     %d\n",
	       req->new_dent, req->mod_dent);
	printk(KERN_DEBUG "\tidx_growth  %d\n", req->idx_growth);
	printk(KERN_DEBUG "\tdata_growth %d dd_growth     %d\n",
	       req->data_growth, req->dd_growth);
	spin_unlock(&dbg_lock);
}
| 567 | |||
/**
 * dbg_dump_lstats - dump LEB properties statistics.
 * @lst: lprops statistics to dump
 *
 * Prints the counters and space totals from @lst under @dbg_lock so the
 * three lines come out together.
 */
void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
{
	spin_lock(&dbg_lock);
	printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs  %d\n",
	       lst->empty_lebs, lst->idx_lebs);
	printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
	       "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
	       lst->total_dirty);
	printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
	       "total_dead %lld\n", lst->total_used, lst->total_dark,
	       lst->total_dead);
	spin_unlock(&dbg_lock);
}
| 581 | |||
/**
 * dbg_dump_budg - dump budgeting information.
 * @c: UBIFS file-system description object
 *
 * Dumps the budgeting counters, the journal heads, the bud LEBs (current and
 * old), and the GC'ed index LEB list. Everything is printed under @dbg_lock
 * so the dump is contiguous in the log.
 *
 * NOTE(review): @dbg_lock does not protect the buds rb-tree or the bud/idx_gc
 * lists themselves — presumably callers hold the appropriate locks or this is
 * best-effort debugging output; confirm against call sites.
 */
void dbg_dump_budg(struct ubifs_info *c)
{
	int i;
	struct rb_node *rb;
	struct ubifs_bud *bud;
	struct ubifs_gced_idx_leb *idx_gc;

	spin_lock(&dbg_lock);
	printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, "
	       "budg_dd_growth %lld, budg_idx_growth %lld\n",
	       c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
	printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
	       "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
	       c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
	       c->freeable_cnt);
	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
	       "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
	       c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
	printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
	       "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
	       atomic_long_read(&c->dirty_zn_cnt),
	       atomic_long_read(&c->clean_zn_cnt));
	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
	       c->gc_lnum, c->ihead_lnum);
	/* One line per journal head: which LEB its write-buffer points at */
	for (i = 0; i < c->jhead_cnt; i++)
		printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
		       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
	/* Current buds are kept in an rb-tree keyed by LEB number */
	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
		bud = rb_entry(rb, struct ubifs_bud, rb);
		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
	}
	list_for_each_entry(bud, &c->old_buds, list)
		printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
	list_for_each_entry(idx_gc, &c->idx_gc, list)
		printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
		       idx_gc->lnum, idx_gc->unmap);
	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
	spin_unlock(&dbg_lock);
}
| 623 | |||
/**
 * dbg_dump_lprop - dump the properties of one LEB.
 * @c: UBIFS file-system description object
 * @lp: LEB properties to dump
 *
 * A single printk, so no locking is needed. "used" is derived as
 * leb_size - free - dirty.
 */
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
{
	printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
	       "flags %#x\n", lp->lnum, lp->free, lp->dirty,
	       c->leb_size - lp->free - lp->dirty, lp->flags);
}
| 630 | |||
| 631 | void dbg_dump_lprops(struct ubifs_info *c) | ||
| 632 | { | ||
| 633 | int lnum, err; | ||
| 634 | struct ubifs_lprops lp; | ||
| 635 | struct ubifs_lp_stats lst; | ||
| 636 | |||
| 637 | printk(KERN_DEBUG "Dumping LEB properties\n"); | ||
| 638 | ubifs_get_lp_stats(c, &lst); | ||
| 639 | dbg_dump_lstats(&lst); | ||
| 640 | |||
| 641 | for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { | ||
| 642 | err = ubifs_read_one_lp(c, lnum, &lp); | ||
| 643 | if (err) | ||
| 644 | ubifs_err("cannot read lprops for LEB %d", lnum); | ||
| 645 | |||
| 646 | dbg_dump_lprop(c, &lp); | ||
| 647 | } | ||
| 648 | } | ||
| 649 | |||
| 650 | void dbg_dump_leb(const struct ubifs_info *c, int lnum) | ||
| 651 | { | ||
| 652 | struct ubifs_scan_leb *sleb; | ||
| 653 | struct ubifs_scan_node *snod; | ||
| 654 | |||
| 655 | if (dbg_failure_mode) | ||
| 656 | return; | ||
| 657 | |||
| 658 | printk(KERN_DEBUG "Dumping LEB %d\n", lnum); | ||
| 659 | |||
| 660 | sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); | ||
| 661 | if (IS_ERR(sleb)) { | ||
| 662 | ubifs_err("scan error %d", (int)PTR_ERR(sleb)); | ||
| 663 | return; | ||
| 664 | } | ||
| 665 | |||
| 666 | printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, | ||
| 667 | sleb->nodes_cnt, sleb->endpt); | ||
| 668 | |||
| 669 | list_for_each_entry(snod, &sleb->nodes, list) { | ||
| 670 | cond_resched(); | ||
| 671 | printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, | ||
| 672 | snod->offs, snod->len); | ||
| 673 | dbg_dump_node(c, snod->node); | ||
| 674 | } | ||
| 675 | |||
| 676 | ubifs_scan_destroy(sleb); | ||
| 677 | return; | ||
| 678 | } | ||
| 679 | |||
/**
 * dbg_dump_znode - dump an in-memory TNC znode and its zbranches.
 * @c: UBIFS file-system description object
 * @znode: the znode to dump
 *
 * Prints the znode header (location taken from the zbranch that points to it,
 * or from the tree root if it has no parent) and then each child zbranch.
 * If the child count is out of range the zbranches are not dumped, since the
 * node is corrupted and iterating them would be unsafe. Done under @dbg_lock
 * so the multi-line dump stays contiguous.
 */
void dbg_dump_znode(const struct ubifs_info *c,
		    const struct ubifs_znode *znode)
{
	int n;
	const struct ubifs_zbranch *zbr;

	spin_lock(&dbg_lock);
	/* Find the zbranch that refers to this znode to get its flash address */
	if (znode->parent)
		zbr = &znode->parent->zbranch[znode->iip];
	else
		zbr = &c->zroot;

	printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
	       " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
	       zbr->len, znode->parent, znode->iip, znode->level,
	       znode->child_cnt, znode->flags);

	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
		spin_unlock(&dbg_lock);
		return;
	}

	printk(KERN_DEBUG "zbranches:\n");
	for (n = 0; n < znode->child_cnt; n++) {
		zbr = &znode->zbranch[n];
		/* Level > 0 branches point at znodes; level 0 at leaf nodes (LNC) */
		if (znode->level > 0)
			printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
			       "%s\n", n, zbr->znode, zbr->lnum,
			       zbr->offs, zbr->len,
			       DBGKEY(&zbr->key));
		else
			printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
			       "%s\n", n, zbr->znode, zbr->lnum,
			       zbr->offs, zbr->len,
			       DBGKEY(&zbr->key));
	}
	spin_unlock(&dbg_lock);
}
| 718 | |||
/**
 * dbg_dump_heap - dump an LPT category heap.
 * @c: UBIFS file-system description object
 * @heap: the LEB-properties heap to dump
 * @cat: heap category number (used only for the dump header)
 *
 * Prints one line per heap element with its LEB number, heap position,
 * free/dirty space and flags.
 */
void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
{
	int i;

	printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n",
	       cat, heap->cnt);
	for (i = 0; i < heap->cnt; i++) {
		struct ubifs_lprops *lprops = heap->arr[i];

		printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
		       "flags %d\n", i, lprops->lnum, lprops->hpos,
		       lprops->free, lprops->dirty, lprops->flags);
	}
}
| 733 | |||
/**
 * dbg_dump_pnode - dump an LPT pnode.
 * @c: UBIFS file-system description object
 * @pnode: the pnode to dump
 * @parent: parent nnode (printed as an address only)
 * @iip: index of @pnode in the parent (printed only)
 *
 * Prints the pnode header followed by the LEB properties of each of its
 * UBIFS_LPT_FANOUT slots. Pointers are cast to size_t for the %zx format.
 */
void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
		    struct ubifs_nnode *parent, int iip)
{
	int i;

	printk(KERN_DEBUG "Dumping pnode:\n");
	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
	printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
	       pnode->flags, iip, pnode->level, pnode->num);
	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
		struct ubifs_lprops *lp = &pnode->lprops[i];

		printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
		       i, lp->free, lp->dirty, lp->flags, lp->lnum);
	}
}
| 751 | |||
| 752 | void dbg_dump_tnc(struct ubifs_info *c) | ||
| 753 | { | ||
| 754 | struct ubifs_znode *znode; | ||
| 755 | int level; | ||
| 756 | |||
| 757 | printk(KERN_DEBUG "\n"); | ||
| 758 | printk(KERN_DEBUG "Dumping the TNC tree\n"); | ||
| 759 | znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); | ||
| 760 | level = znode->level; | ||
| 761 | printk(KERN_DEBUG "== Level %d ==\n", level); | ||
| 762 | while (znode) { | ||
| 763 | if (level != znode->level) { | ||
| 764 | level = znode->level; | ||
| 765 | printk(KERN_DEBUG "== Level %d ==\n", level); | ||
| 766 | } | ||
| 767 | dbg_dump_znode(c, znode); | ||
| 768 | znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); | ||
| 769 | } | ||
| 770 | |||
| 771 | printk(KERN_DEBUG "\n"); | ||
| 772 | } | ||
| 773 | |||
/*
 * dump_znode - dbg_walk_index() callback which dumps one znode.
 * @priv is unused; always returns 0 so the walk continues.
 */
static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
		      void *priv)
{
	dbg_dump_znode(c, znode);
	return 0;
}
| 780 | |||
/**
 * dbg_dump_index - dump the on-flash index.
 * @c: UBIFS file-system description object
 *
 * This function dumps the whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
 * which dumps only the in-memory znodes and does not read znodes from flash.
 */
void dbg_dump_index(struct ubifs_info *c)
{
	dbg_walk_index(c, NULL, dump_znode, NULL);
}
| 792 | |||
/**
 * dbg_check_synced_i_size - check synchronized inode size.
 * @inode: inode to check
 *
 * If inode is clean, synchronized inode size has to be equivalent to current
 * inode size. This function has to be called only for locked inodes (@i_mutex
 * has to be locked). Returns %0 if synchronized inode size is correct, and
 * %-EINVAL if not.
 */
int dbg_check_synced_i_size(struct inode *inode)
{
	int err = 0;
	struct ubifs_inode *ui = ubifs_inode(inode);

	/* Only run when general debugging checks are enabled */
	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
		return 0;
	/* The invariant only holds for regular files */
	if (!S_ISREG(inode->i_mode))
		return 0;

	mutex_lock(&ui->ui_mutex);
	spin_lock(&ui->ui_lock);
	/* A clean inode must have its size fully synchronized to the media */
	if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
		ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
			  "is clean", ui->ui_size, ui->synced_i_size);
		ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
			  inode->i_mode, i_size_read(inode));
		dbg_dump_stack();
		err = -EINVAL;
	}
	spin_unlock(&ui->ui_lock);
	mutex_unlock(&ui->ui_mutex);
	return err;
}
| 826 | |||
/**
 * dbg_check_dir_size - check directory inode size and link count.
 * @c: UBIFS file-system description object
 * @dir: the directory to calculate size for
 *
 * This function makes sure that directory size and link count are correct by
 * walking all direntries of @dir via the TNC and recomputing both values.
 * Returns zero in case of success and a negative error code in case of
 * failure.
 *
 * Note, it is a good idea to make sure the @dir->i_mutex is locked before
 * calling this function.
 */
int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
{
	unsigned int nlink = 2;	/* every directory has "." and ".." */
	union ubifs_key key;
	struct ubifs_dent_node *dent, *pdent = NULL;
	struct qstr nm = { .name = NULL };
	loff_t size = UBIFS_INO_NODE_SZ;

	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
		return 0;

	if (!S_ISDIR(dir->i_mode))
		return 0;

	/* Iterate all direntries of @dir in key order */
	lowest_dent_key(c, &key, dir->i_ino);
	while (1) {
		int err;

		dent = ubifs_tnc_next_ent(c, &key, &nm);
		if (IS_ERR(dent)) {
			err = PTR_ERR(dent);
			if (err == -ENOENT)
				break;	/* no more entries */
			return err;
		}

		nm.name = dent->name;
		nm.len = le16_to_cpu(dent->nlen);
		size += CALC_DENT_SIZE(nm.len);
		if (dent->type == UBIFS_ITYPE_DIR)
			nlink += 1;	/* sub-directory's ".." counts */
		/* Keep the previous entry alive - @nm still points into it */
		kfree(pdent);
		pdent = dent;
		key_read(c, &dent->key, &key);
	}
	kfree(pdent);

	if (i_size_read(dir) != size) {
		ubifs_err("directory inode %lu has size %llu, "
			  "but calculated size is %llu", dir->i_ino,
			  (unsigned long long)i_size_read(dir),
			  (unsigned long long)size);
		dump_stack();
		return -EINVAL;
	}
	if (dir->i_nlink != nlink) {
		ubifs_err("directory inode %lu has nlink %u, but calculated "
			  "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
		dump_stack();
		return -EINVAL;
	}

	return 0;
}
| 894 | |||
/**
 * dbg_check_key_order - make sure that colliding keys are properly ordered.
 * @c: UBIFS file-system description object
 * @zbr1: first zbranch
 * @zbr2: following zbranch
 *
 * In UBIFS indexing B-tree colliding keys have to be sorted in binary order of
 * names of the direntries/xentries which are referred by the keys. This
 * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
 * sure the name of direntry/xentry referred by @zbr1 is less than
 * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
 * and a negative error code in case of failure.
 */
static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
			       struct ubifs_zbranch *zbr2)
{
	int err, nlen1, nlen2, cmp;
	struct ubifs_dent_node *dent1, *dent2;
	union ubifs_key key;

	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
	if (!dent1)
		return -ENOMEM;
	dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
	if (!dent2) {
		err = -ENOMEM;
		goto out_free;	/* kfree(NULL) below is a no-op */
	}

	err = ubifs_tnc_read_node(c, zbr1, dent1);
	if (err)
		goto out_free;
	err = ubifs_validate_entry(c, dent1);
	if (err)
		goto out_free;

	err = ubifs_tnc_read_node(c, zbr2, dent2);
	if (err)
		goto out_free;
	err = ubifs_validate_entry(c, dent2);
	if (err)
		goto out_free;

	/* Make sure node keys are the same as in zbranch */
	err = 1;	/* from here on, falling through means "bad order" */
	key_read(c, &dent1->key, &key);
	if (keys_cmp(c, &zbr1->key, &key)) {
		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
			zbr1->offs, DBGKEY(&key));
		dbg_err("but it should have key %s according to tnc",
			DBGKEY(&zbr1->key));
		dbg_dump_node(c, dent1);
		goto out_free;
	}

	key_read(c, &dent2->key, &key);
	if (keys_cmp(c, &zbr2->key, &key)) {
		dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
			zbr1->offs, DBGKEY(&key));
		dbg_err("but it should have key %s according to tnc",
			DBGKEY(&zbr2->key));
		dbg_dump_node(c, dent2);
		goto out_free;
	}

	nlen1 = le16_to_cpu(dent1->nlen);
	nlen2 = le16_to_cpu(dent2->nlen);

	/* Binary name comparison; shorter name wins a common-prefix tie */
	cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
	if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
		err = 0;	/* order is correct */
		goto out_free;
	}
	if (cmp == 0 && nlen1 == nlen2)
		dbg_err("2 xent/dent nodes with the same name");
	else
		dbg_err("bad order of colliding key %s",
			DBGKEY(&key));

	dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
	dbg_dump_node(c, dent1);
	dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
	dbg_dump_node(c, dent2);

out_free:
	kfree(dent2);
	kfree(dent1);
	return err;
}
| 985 | |||
/**
 * dbg_check_znode - check if znode is all right.
 * @c: UBIFS file-system description object
 * @zbr: zbranch which points to this znode
 *
 * This function makes sure that znode referred to by @zbr is all right.
 * Returns zero if it is, and %-EINVAL if it is not. On failure the specific
 * violation is identified by an internal error number (1-20) printed before
 * the znode dump.
 */
static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
{
	struct ubifs_znode *znode = zbr->znode;
	struct ubifs_znode *zp = znode->parent;
	int n, err, cmp;

	/* Basic sanity of the znode header fields */
	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
		err = 1;
		goto out;
	}
	if (znode->level < 0) {
		err = 2;
		goto out;
	}
	if (znode->iip < 0 || znode->iip >= c->fanout) {
		err = 3;
		goto out;
	}

	if (zbr->len == 0)
		/* Only dirty zbranch may have no on-flash nodes */
		if (!ubifs_zn_dirty(znode)) {
			err = 4;
			goto out;
		}

	if (ubifs_zn_dirty(znode)) {
		/*
		 * If znode is dirty, its parent has to be dirty as well. The
		 * order of the operation is important, so we have to have
		 * memory barriers.
		 */
		smp_mb();
		if (zp && !ubifs_zn_dirty(zp)) {
			/*
			 * The dirty flag is atomic and is cleared outside the
			 * TNC mutex, so znode's dirty flag may now have
			 * been cleared. The child is always cleared before the
			 * parent, so we just need to check again.
			 */
			smp_mb();
			if (ubifs_zn_dirty(znode)) {
				err = 5;
				goto out;
			}
		}
	}

	if (zp) {
		const union ubifs_key *min, *max;

		if (znode->level != zp->level - 1) {
			err = 6;
			goto out;
		}

		/* Make sure the 'parent' pointer in our znode is correct */
		err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
		if (!err) {
			/* This zbranch does not exist in the parent */
			err = 7;
			goto out;
		}

		if (znode->iip >= zp->child_cnt) {
			err = 8;
			goto out;
		}

		if (znode->iip != n) {
			/* This may happen only in case of collisions */
			if (keys_cmp(c, &zp->zbranch[n].key,
				     &zp->zbranch[znode->iip].key)) {
				err = 9;
				goto out;
			}
			n = znode->iip;
		}

		/*
		 * Make sure that the first key in our znode is greater than or
		 * equal to the key in the pointing zbranch.
		 */
		min = &zbr->key;
		cmp = keys_cmp(c, min, &znode->zbranch[0].key);
		if (cmp == 1) {
			err = 10;
			goto out;
		}

		if (n + 1 < zp->child_cnt) {
			max = &zp->zbranch[n + 1].key;

			/*
			 * Make sure the last key in our znode is less or
			 * equivalent than the key in the zbranch which goes
			 * after our pointing zbranch.
			 */
			cmp = keys_cmp(c, max,
				&znode->zbranch[znode->child_cnt - 1].key);
			if (cmp == -1) {
				err = 11;
				goto out;
			}
		}
	} else {
		/* This may only be root znode */
		if (zbr != &c->zroot) {
			err = 12;
			goto out;
		}
	}

	/*
	 * Make sure that next key is greater or equivalent then the previous
	 * one.
	 */
	for (n = 1; n < znode->child_cnt; n++) {
		cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
			       &znode->zbranch[n].key);
		if (cmp > 0) {
			err = 13;
			goto out;
		}
		if (cmp == 0) {
			/* This can only be keys with colliding hash */
			if (!is_hash_key(c, &znode->zbranch[n].key)) {
				err = 14;
				goto out;
			}

			if (znode->level != 0 || c->replaying)
				continue;

			/*
			 * Colliding keys should follow binary order of
			 * corresponding xentry/dentry names.
			 */
			err = dbg_check_key_order(c, &znode->zbranch[n - 1],
						  &znode->zbranch[n]);
			if (err < 0)
				return err;
			if (err) {
				err = 15;
				goto out;
			}
		}
	}

	/* Sanity-check every zbranch's location fields */
	for (n = 0; n < znode->child_cnt; n++) {
		if (!znode->zbranch[n].znode &&
		    (znode->zbranch[n].lnum == 0 ||
		     znode->zbranch[n].len == 0)) {
			err = 16;
			goto out;
		}

		if (znode->zbranch[n].lnum != 0 &&
		    znode->zbranch[n].len == 0) {
			err = 17;
			goto out;
		}

		if (znode->zbranch[n].lnum == 0 &&
		    znode->zbranch[n].len != 0) {
			err = 18;
			goto out;
		}

		if (znode->zbranch[n].lnum == 0 &&
		    znode->zbranch[n].offs != 0) {
			err = 19;
			goto out;
		}

		if (znode->level != 0 && znode->zbranch[n].znode)
			if (znode->zbranch[n].znode->parent != znode) {
				err = 20;
				goto out;
			}
	}

	return 0;

out:
	ubifs_err("failed, error %d", err);
	ubifs_msg("dump of the znode");
	dbg_dump_znode(c, znode);
	if (zp) {
		ubifs_msg("dump of the parent znode");
		dbg_dump_znode(c, zp);
	}
	dump_stack();
	return -EINVAL;
}
| 1189 | |||
/**
 * dbg_check_tnc - check TNC tree.
 * @c: UBIFS file-system description object
 * @extra: do extra checks that are possible at start commit
 *
 * This function traverses the whole TNC tree and checks every znode. Returns
 * zero if everything is all right and %-EINVAL if something is wrong with
 * TNC. When @extra is set it also verifies that the clean/dirty znode
 * counters match the tree contents. Must be called with @c->tnc_mutex held.
 */
int dbg_check_tnc(struct ubifs_info *c, int extra)
{
	struct ubifs_znode *znode;
	long clean_cnt = 0, dirty_cnt = 0;
	int err, last;

	if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
		return 0;

	ubifs_assert(mutex_is_locked(&c->tnc_mutex));
	/* Empty in-memory TNC is trivially consistent */
	if (!c->zroot.znode)
		return 0;

	znode = ubifs_tnc_postorder_first(c->zroot.znode);
	while (1) {
		struct ubifs_znode *prev;
		struct ubifs_zbranch *zbr;

		/* The zbranch pointing at this znode (or the root zbranch) */
		if (!znode->parent)
			zbr = &c->zroot;
		else
			zbr = &znode->parent->zbranch[znode->iip];

		err = dbg_check_znode(c, zbr);
		if (err)
			return err;

		if (extra) {
			if (ubifs_zn_dirty(znode))
				dirty_cnt += 1;
			else
				clean_cnt += 1;
		}

		prev = znode;
		znode = ubifs_tnc_postorder_next(znode);
		if (!znode)
			break;

		/*
		 * If the last key of this znode is equivalent to the first key
		 * of the next znode (collision), then check order of the keys.
		 */
		last = prev->child_cnt - 1;
		if (prev->level == 0 && znode->level == 0 && !c->replaying &&
		    !keys_cmp(c, &prev->zbranch[last].key,
			      &znode->zbranch[0].key)) {
			err = dbg_check_key_order(c, &prev->zbranch[last],
						  &znode->zbranch[0]);
			if (err < 0)
				return err;
			if (err) {
				ubifs_msg("first znode");
				dbg_dump_znode(c, prev);
				ubifs_msg("second znode");
				dbg_dump_znode(c, znode);
				return -EINVAL;
			}
		}
	}

	if (extra) {
		/* Counted totals must match the global atomic counters */
		if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
			ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
				  atomic_long_read(&c->clean_zn_cnt),
				  clean_cnt);
			return -EINVAL;
		}
		if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
			ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
				  atomic_long_read(&c->dirty_zn_cnt),
				  dirty_cnt);
			return -EINVAL;
		}
	}

	return 0;
}
| 1276 | |||
| 1277 | /** | ||
| 1278 | * dbg_walk_index - walk the on-flash index. | ||
| 1279 | * @c: UBIFS file-system description object | ||
| 1280 | * @leaf_cb: called for each leaf node | ||
| 1281 | * @znode_cb: called for each indexing node | ||
| 1282 | * @priv: private date which is passed to callbacks | ||
| 1283 | * | ||
| 1284 | * This function walks the UBIFS index and calls the @leaf_cb for each leaf | ||
| 1285 | * node and @znode_cb for each indexing node. Returns zero in case of success | ||
| 1286 | * and a negative error code in case of failure. | ||
| 1287 | * | ||
| 1288 | * It would be better if this function removed every znode it pulled to into | ||
| 1289 | * the TNC, so that the behavior more closely matched the non-debugging | ||
| 1290 | * behavior. | ||
| 1291 | */ | ||
| 1292 | int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, | ||
| 1293 | dbg_znode_callback znode_cb, void *priv) | ||
| 1294 | { | ||
| 1295 | int err; | ||
| 1296 | struct ubifs_zbranch *zbr; | ||
| 1297 | struct ubifs_znode *znode, *child; | ||
| 1298 | |||
| 1299 | mutex_lock(&c->tnc_mutex); | ||
| 1300 | /* If the root indexing node is not in TNC - pull it */ | ||
| 1301 | if (!c->zroot.znode) { | ||
| 1302 | c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0); | ||
| 1303 | if (IS_ERR(c->zroot.znode)) { | ||
| 1304 | err = PTR_ERR(c->zroot.znode); | ||
| 1305 | c->zroot.znode = NULL; | ||
| 1306 | goto out_unlock; | ||
| 1307 | } | ||
| 1308 | } | ||
| 1309 | |||
| 1310 | /* | ||
| 1311 | * We are going to traverse the indexing tree in the postorder manner. | ||
| 1312 | * Go down and find the leftmost indexing node where we are going to | ||
| 1313 | * start from. | ||
| 1314 | */ | ||
| 1315 | znode = c->zroot.znode; | ||
| 1316 | while (znode->level > 0) { | ||
| 1317 | zbr = &znode->zbranch[0]; | ||
| 1318 | child = zbr->znode; | ||
| 1319 | if (!child) { | ||
| 1320 | child = ubifs_load_znode(c, zbr, znode, 0); | ||
| 1321 | if (IS_ERR(child)) { | ||
| 1322 | err = PTR_ERR(child); | ||
| 1323 | goto out_unlock; | ||
| 1324 | } | ||
| 1325 | zbr->znode = child; | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | znode = child; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* Iterate over all indexing nodes */ | ||
| 1332 | while (1) { | ||
| 1333 | int idx; | ||
| 1334 | |||
| 1335 | cond_resched(); | ||
| 1336 | |||
| 1337 | if (znode_cb) { | ||
| 1338 | err = znode_cb(c, znode, priv); | ||
| 1339 | if (err) { | ||
| 1340 | ubifs_err("znode checking function returned " | ||
| 1341 | "error %d", err); | ||
| 1342 | dbg_dump_znode(c, znode); | ||
| 1343 | goto out_dump; | ||
| 1344 | } | ||
| 1345 | } | ||
| 1346 | if (leaf_cb && znode->level == 0) { | ||
| 1347 | for (idx = 0; idx < znode->child_cnt; idx++) { | ||
| 1348 | zbr = &znode->zbranch[idx]; | ||
| 1349 | err = leaf_cb(c, zbr, priv); | ||
| 1350 | if (err) { | ||
| 1351 | ubifs_err("leaf checking function " | ||
| 1352 | "returned error %d, for leaf " | ||
| 1353 | "at LEB %d:%d", | ||
| 1354 | err, zbr->lnum, zbr->offs); | ||
| 1355 | goto out_dump; | ||
| 1356 | } | ||
| 1357 | } | ||
| 1358 | } | ||
| 1359 | |||
| 1360 | if (!znode->parent) | ||
| 1361 | break; | ||
| 1362 | |||
| 1363 | idx = znode->iip + 1; | ||
| 1364 | znode = znode->parent; | ||
| 1365 | if (idx < znode->child_cnt) { | ||
| 1366 | /* Switch to the next index in the parent */ | ||
| 1367 | zbr = &znode->zbranch[idx]; | ||
| 1368 | child = zbr->znode; | ||
| 1369 | if (!child) { | ||
| 1370 | child = ubifs_load_znode(c, zbr, znode, idx); | ||
| 1371 | if (IS_ERR(child)) { | ||
| 1372 | err = PTR_ERR(child); | ||
| 1373 | goto out_unlock; | ||
| 1374 | } | ||
| 1375 | zbr->znode = child; | ||
| 1376 | } | ||
| 1377 | znode = child; | ||
| 1378 | } else | ||
| 1379 | /* | ||
| 1380 | * This is the last child, switch to the parent and | ||
| 1381 | * continue. | ||
| 1382 | */ | ||
| 1383 | continue; | ||
| 1384 | |||
| 1385 | /* Go to the lowest leftmost znode in the new sub-tree */ | ||
| 1386 | while (znode->level > 0) { | ||
| 1387 | zbr = &znode->zbranch[0]; | ||
| 1388 | child = zbr->znode; | ||
| 1389 | if (!child) { | ||
| 1390 | child = ubifs_load_znode(c, zbr, znode, 0); | ||
| 1391 | if (IS_ERR(child)) { | ||
| 1392 | err = PTR_ERR(child); | ||
| 1393 | goto out_unlock; | ||
| 1394 | } | ||
| 1395 | zbr->znode = child; | ||
| 1396 | } | ||
| 1397 | znode = child; | ||
| 1398 | } | ||
| 1399 | } | ||
| 1400 | |||
| 1401 | mutex_unlock(&c->tnc_mutex); | ||
| 1402 | return 0; | ||
| 1403 | |||
| 1404 | out_dump: | ||
| 1405 | if (znode->parent) | ||
| 1406 | zbr = &znode->parent->zbranch[znode->iip]; | ||
| 1407 | else | ||
| 1408 | zbr = &c->zroot; | ||
| 1409 | ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); | ||
| 1410 | dbg_dump_znode(c, znode); | ||
| 1411 | out_unlock: | ||
| 1412 | mutex_unlock(&c->tnc_mutex); | ||
| 1413 | return err; | ||
| 1414 | } | ||
| 1415 | |||
| 1416 | /** | ||
| 1417 | * add_size - add znode size to partially calculated index size. | ||
| 1418 | * @c: UBIFS file-system description object | ||
| 1419 | * @znode: znode to add size for | ||
| 1420 | * @priv: partially calculated index size | ||
| 1421 | * | ||
| 1422 | * This is a helper function for 'dbg_check_idx_size()' which is called for | ||
| 1423 | * every indexing node and adds its size to the 'long long' variable pointed to | ||
| 1424 | * by @priv. | ||
| 1425 | */ | ||
| 1426 | static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) | ||
| 1427 | { | ||
| 1428 | long long *idx_size = priv; | ||
| 1429 | int add; | ||
| 1430 | |||
| 1431 | add = ubifs_idx_node_sz(c, znode->child_cnt); | ||
| 1432 | add = ALIGN(add, 8); | ||
| 1433 | *idx_size += add; | ||
| 1434 | return 0; | ||
| 1435 | } | ||
| 1436 | |||
| 1437 | /** | ||
| 1438 | * dbg_check_idx_size - check index size. | ||
| 1439 | * @c: UBIFS file-system description object | ||
| 1440 | * @idx_size: size to check | ||
| 1441 | * | ||
| 1442 | * This function walks the UBIFS index, calculates its size and checks that the | ||
| 1443 | * size is equivalent to @idx_size. Returns zero in case of success and a | ||
| 1444 | * negative error code in case of failure. | ||
| 1445 | */ | ||
| 1446 | int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) | ||
| 1447 | { | ||
| 1448 | int err; | ||
| 1449 | long long calc = 0; | ||
| 1450 | |||
| 1451 | if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) | ||
| 1452 | return 0; | ||
| 1453 | |||
| 1454 | err = dbg_walk_index(c, NULL, add_size, &calc); | ||
| 1455 | if (err) { | ||
| 1456 | ubifs_err("error %d while walking the index", err); | ||
| 1457 | return err; | ||
| 1458 | } | ||
| 1459 | |||
| 1460 | if (calc != idx_size) { | ||
| 1461 | ubifs_err("index size check failed: calculated size is %lld, " | ||
| 1462 | "should be %lld", calc, idx_size); | ||
| 1463 | dump_stack(); | ||
| 1464 | return -EINVAL; | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | return 0; | ||
| 1468 | } | ||
| 1469 | |||
| 1470 | /** | ||
| 1471 | * struct fsck_inode - information about an inode used when checking the file-system. | ||
| 1472 | * @rb: link in the RB-tree of inodes | ||
| 1473 | * @inum: inode number | ||
| 1474 | * @mode: inode type, permissions, etc | ||
| 1475 | * @nlink: inode link count | ||
| 1476 | * @xattr_cnt: count of extended attributes | ||
| 1477 | * @references: how many directory/xattr entries refer this inode (calculated | ||
| 1478 | * while walking the index) | ||
| 1479 | * @calc_cnt: for directory inode count of child directories | ||
| 1480 | * @size: inode size (read from on-flash inode) | ||
| 1481 | * @xattr_sz: summary size of all extended attributes (read from on-flash | ||
| 1482 | * inode) | ||
| 1483 | * @calc_sz: for directories calculated directory size | ||
| 1484 | * @calc_xcnt: count of extended attributes | ||
| 1485 | * @calc_xsz: calculated summary size of all extended attributes | ||
| 1486 | * @xattr_nms: sum of lengths of all extended attribute names belonging to this | ||
| 1487 | * inode (read from on-flash inode) | ||
| 1488 | * @calc_xnms: calculated sum of lengths of all extended attribute names | ||
| 1489 | */ | ||
| 1490 | struct fsck_inode { | ||
| 1491 | struct rb_node rb; | ||
| 1492 | ino_t inum; | ||
| 1493 | umode_t mode; | ||
| 1494 | unsigned int nlink; | ||
| 1495 | unsigned int xattr_cnt; | ||
| 1496 | int references; | ||
| 1497 | int calc_cnt; | ||
| 1498 | long long size; | ||
| 1499 | unsigned int xattr_sz; | ||
| 1500 | long long calc_sz; | ||
| 1501 | long long calc_xcnt; | ||
| 1502 | long long calc_xsz; | ||
| 1503 | unsigned int xattr_nms; | ||
| 1504 | long long calc_xnms; | ||
| 1505 | }; | ||
| 1506 | |||
| 1507 | /** | ||
| 1508 | * struct fsck_data - private FS checking information. | ||
| 1509 | * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects) | ||
| 1510 | */ | ||
| 1511 | struct fsck_data { | ||
| 1512 | struct rb_root inodes; | ||
| 1513 | }; | ||
| 1514 | |||
| 1515 | /** | ||
| 1516 | * add_inode - add inode information to RB-tree of inodes. | ||
| 1517 | * @c: UBIFS file-system description object | ||
| 1518 | * @fsckd: FS checking information | ||
| 1519 | * @ino: raw UBIFS inode to add | ||
| 1520 | * | ||
| 1521 | * This is a helper function for 'check_leaf()' which adds information about | ||
| 1522 | * inode @ino to the RB-tree of inodes. Returns inode information pointer in | ||
| 1523 | * case of success and a negative error code in case of failure. | ||
| 1524 | */ | ||
| 1525 | static struct fsck_inode *add_inode(struct ubifs_info *c, | ||
| 1526 | struct fsck_data *fsckd, | ||
| 1527 | struct ubifs_ino_node *ino) | ||
| 1528 | { | ||
| 1529 | struct rb_node **p, *parent = NULL; | ||
| 1530 | struct fsck_inode *fscki; | ||
| 1531 | ino_t inum = key_inum_flash(c, &ino->key); | ||
| 1532 | |||
| 1533 | p = &fsckd->inodes.rb_node; | ||
| 1534 | while (*p) { | ||
| 1535 | parent = *p; | ||
| 1536 | fscki = rb_entry(parent, struct fsck_inode, rb); | ||
| 1537 | if (inum < fscki->inum) | ||
| 1538 | p = &(*p)->rb_left; | ||
| 1539 | else if (inum > fscki->inum) | ||
| 1540 | p = &(*p)->rb_right; | ||
| 1541 | else | ||
| 1542 | return fscki; | ||
| 1543 | } | ||
| 1544 | |||
| 1545 | if (inum > c->highest_inum) { | ||
| 1546 | ubifs_err("too high inode number, max. is %lu", | ||
| 1547 | c->highest_inum); | ||
| 1548 | return ERR_PTR(-EINVAL); | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS); | ||
| 1552 | if (!fscki) | ||
| 1553 | return ERR_PTR(-ENOMEM); | ||
| 1554 | |||
| 1555 | fscki->inum = inum; | ||
| 1556 | fscki->nlink = le32_to_cpu(ino->nlink); | ||
| 1557 | fscki->size = le64_to_cpu(ino->size); | ||
| 1558 | fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); | ||
| 1559 | fscki->xattr_sz = le32_to_cpu(ino->xattr_size); | ||
| 1560 | fscki->xattr_nms = le32_to_cpu(ino->xattr_names); | ||
| 1561 | fscki->mode = le32_to_cpu(ino->mode); | ||
| 1562 | if (S_ISDIR(fscki->mode)) { | ||
| 1563 | fscki->calc_sz = UBIFS_INO_NODE_SZ; | ||
| 1564 | fscki->calc_cnt = 2; | ||
| 1565 | } | ||
| 1566 | rb_link_node(&fscki->rb, parent, p); | ||
| 1567 | rb_insert_color(&fscki->rb, &fsckd->inodes); | ||
| 1568 | return fscki; | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | /** | ||
| 1572 | * search_inode - search inode in the RB-tree of inodes. | ||
| 1573 | * @fsckd: FS checking information | ||
| 1574 | * @inum: inode number to search | ||
| 1575 | * | ||
| 1576 | * This is a helper function for 'check_leaf()' which searches inode @inum in | ||
| 1577 | * the RB-tree of inodes and returns an inode information pointer or %NULL if | ||
| 1578 | * the inode was not found. | ||
| 1579 | */ | ||
| 1580 | static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum) | ||
| 1581 | { | ||
| 1582 | struct rb_node *p; | ||
| 1583 | struct fsck_inode *fscki; | ||
| 1584 | |||
| 1585 | p = fsckd->inodes.rb_node; | ||
| 1586 | while (p) { | ||
| 1587 | fscki = rb_entry(p, struct fsck_inode, rb); | ||
| 1588 | if (inum < fscki->inum) | ||
| 1589 | p = p->rb_left; | ||
| 1590 | else if (inum > fscki->inum) | ||
| 1591 | p = p->rb_right; | ||
| 1592 | else | ||
| 1593 | return fscki; | ||
| 1594 | } | ||
| 1595 | return NULL; | ||
| 1596 | } | ||
| 1597 | |||
| 1598 | /** | ||
| 1599 | * read_add_inode - read inode node and add it to RB-tree of inodes. | ||
| 1600 | * @c: UBIFS file-system description object | ||
| 1601 | * @fsckd: FS checking information | ||
| 1602 | * @inum: inode number to read | ||
| 1603 | * | ||
| 1604 | * This is a helper function for 'check_leaf()' which finds inode node @inum in | ||
| 1605 | * the index, reads it, and adds it to the RB-tree of inodes. Returns inode | ||
| 1606 | * information pointer in case of success and a negative error code in case of | ||
| 1607 | * failure. | ||
| 1608 | */ | ||
| 1609 | static struct fsck_inode *read_add_inode(struct ubifs_info *c, | ||
| 1610 | struct fsck_data *fsckd, ino_t inum) | ||
| 1611 | { | ||
| 1612 | int n, err; | ||
| 1613 | union ubifs_key key; | ||
| 1614 | struct ubifs_znode *znode; | ||
| 1615 | struct ubifs_zbranch *zbr; | ||
| 1616 | struct ubifs_ino_node *ino; | ||
| 1617 | struct fsck_inode *fscki; | ||
| 1618 | |||
| 1619 | fscki = search_inode(fsckd, inum); | ||
| 1620 | if (fscki) | ||
| 1621 | return fscki; | ||
| 1622 | |||
| 1623 | ino_key_init(c, &key, inum); | ||
| 1624 | err = ubifs_lookup_level0(c, &key, &znode, &n); | ||
| 1625 | if (!err) { | ||
| 1626 | ubifs_err("inode %lu not found in index", inum); | ||
| 1627 | return ERR_PTR(-ENOENT); | ||
| 1628 | } else if (err < 0) { | ||
| 1629 | ubifs_err("error %d while looking up inode %lu", err, inum); | ||
| 1630 | return ERR_PTR(err); | ||
| 1631 | } | ||
| 1632 | |||
| 1633 | zbr = &znode->zbranch[n]; | ||
| 1634 | if (zbr->len < UBIFS_INO_NODE_SZ) { | ||
| 1635 | ubifs_err("bad node %lu node length %d", inum, zbr->len); | ||
| 1636 | return ERR_PTR(-EINVAL); | ||
| 1637 | } | ||
| 1638 | |||
| 1639 | ino = kmalloc(zbr->len, GFP_NOFS); | ||
| 1640 | if (!ino) | ||
| 1641 | return ERR_PTR(-ENOMEM); | ||
| 1642 | |||
| 1643 | err = ubifs_tnc_read_node(c, zbr, ino); | ||
| 1644 | if (err) { | ||
| 1645 | ubifs_err("cannot read inode node at LEB %d:%d, error %d", | ||
| 1646 | zbr->lnum, zbr->offs, err); | ||
| 1647 | kfree(ino); | ||
| 1648 | return ERR_PTR(err); | ||
| 1649 | } | ||
| 1650 | |||
| 1651 | fscki = add_inode(c, fsckd, ino); | ||
| 1652 | kfree(ino); | ||
| 1653 | if (IS_ERR(fscki)) { | ||
| 1654 | ubifs_err("error %ld while adding inode %lu node", | ||
| 1655 | PTR_ERR(fscki), inum); | ||
| 1656 | return fscki; | ||
| 1657 | } | ||
| 1658 | |||
| 1659 | return fscki; | ||
| 1660 | } | ||
| 1661 | |||
| 1662 | /** | ||
| 1663 | * check_leaf - check leaf node. | ||
| 1664 | * @c: UBIFS file-system description object | ||
| 1665 | * @zbr: zbranch of the leaf node to check | ||
| 1666 | * @priv: FS checking information | ||
| 1667 | * | ||
| 1668 | * This is a helper function for 'dbg_check_filesystem()' which is called for | ||
| 1669 | * every single leaf node while walking the indexing tree. It checks that the | ||
| 1670 | * leaf node referred from the indexing tree exists, has correct CRC, and does | ||
| 1671 | * some other basic validation. This function is also responsible for building | ||
| 1672 | * an RB-tree of inodes - it adds all inodes into the RB-tree. It also | ||
| 1673 | * calculates reference count, size, etc for each inode in order to later | ||
| 1674 | * compare them to the information stored inside the inodes and detect possible | ||
| 1675 | * inconsistencies. Returns zero in case of success and a negative error code | ||
| 1676 | * in case of failure. | ||
| 1677 | */ | ||
| 1678 | static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, | ||
| 1679 | void *priv) | ||
| 1680 | { | ||
| 1681 | ino_t inum; | ||
| 1682 | void *node; | ||
| 1683 | struct ubifs_ch *ch; | ||
| 1684 | int err, type = key_type(c, &zbr->key); | ||
| 1685 | struct fsck_inode *fscki; | ||
| 1686 | |||
| 1687 | if (zbr->len < UBIFS_CH_SZ) { | ||
| 1688 | ubifs_err("bad leaf length %d (LEB %d:%d)", | ||
| 1689 | zbr->len, zbr->lnum, zbr->offs); | ||
| 1690 | return -EINVAL; | ||
| 1691 | } | ||
| 1692 | |||
| 1693 | node = kmalloc(zbr->len, GFP_NOFS); | ||
| 1694 | if (!node) | ||
| 1695 | return -ENOMEM; | ||
| 1696 | |||
| 1697 | err = ubifs_tnc_read_node(c, zbr, node); | ||
| 1698 | if (err) { | ||
| 1699 | ubifs_err("cannot read leaf node at LEB %d:%d, error %d", | ||
| 1700 | zbr->lnum, zbr->offs, err); | ||
| 1701 | goto out_free; | ||
| 1702 | } | ||
| 1703 | |||
| 1704 | /* If this is an inode node, add it to RB-tree of inodes */ | ||
| 1705 | if (type == UBIFS_INO_KEY) { | ||
| 1706 | fscki = add_inode(c, priv, node); | ||
| 1707 | if (IS_ERR(fscki)) { | ||
| 1708 | err = PTR_ERR(fscki); | ||
| 1709 | ubifs_err("error %d while adding inode node", err); | ||
| 1710 | goto out_dump; | ||
| 1711 | } | ||
| 1712 | goto out; | ||
| 1713 | } | ||
| 1714 | |||
| 1715 | if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && | ||
| 1716 | type != UBIFS_DATA_KEY) { | ||
| 1717 | ubifs_err("unexpected node type %d at LEB %d:%d", | ||
| 1718 | type, zbr->lnum, zbr->offs); | ||
| 1719 | err = -EINVAL; | ||
| 1720 | goto out_free; | ||
| 1721 | } | ||
| 1722 | |||
| 1723 | ch = node; | ||
| 1724 | if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { | ||
| 1725 | ubifs_err("too high sequence number, max. is %llu", | ||
| 1726 | c->max_sqnum); | ||
| 1727 | err = -EINVAL; | ||
| 1728 | goto out_dump; | ||
| 1729 | } | ||
| 1730 | |||
| 1731 | if (type == UBIFS_DATA_KEY) { | ||
| 1732 | long long blk_offs; | ||
| 1733 | struct ubifs_data_node *dn = node; | ||
| 1734 | |||
| 1735 | /* | ||
| 1736 | * Search the inode node this data node belongs to and insert | ||
| 1737 | * it to the RB-tree of inodes. | ||
| 1738 | */ | ||
| 1739 | inum = key_inum_flash(c, &dn->key); | ||
| 1740 | fscki = read_add_inode(c, priv, inum); | ||
| 1741 | if (IS_ERR(fscki)) { | ||
| 1742 | err = PTR_ERR(fscki); | ||
| 1743 | ubifs_err("error %d while processing data node and " | ||
| 1744 | "trying to find inode node %lu", err, inum); | ||
| 1745 | goto out_dump; | ||
| 1746 | } | ||
| 1747 | |||
| 1748 | /* Make sure the data node is within inode size */ | ||
| 1749 | blk_offs = key_block_flash(c, &dn->key); | ||
| 1750 | blk_offs <<= UBIFS_BLOCK_SHIFT; | ||
| 1751 | blk_offs += le32_to_cpu(dn->size); | ||
| 1752 | if (blk_offs > fscki->size) { | ||
| 1753 | ubifs_err("data node at LEB %d:%d is not within inode " | ||
| 1754 | "size %lld", zbr->lnum, zbr->offs, | ||
| 1755 | fscki->size); | ||
| 1756 | err = -EINVAL; | ||
| 1757 | goto out_dump; | ||
| 1758 | } | ||
| 1759 | } else { | ||
| 1760 | int nlen; | ||
| 1761 | struct ubifs_dent_node *dent = node; | ||
| 1762 | struct fsck_inode *fscki1; | ||
| 1763 | |||
| 1764 | err = ubifs_validate_entry(c, dent); | ||
| 1765 | if (err) | ||
| 1766 | goto out_dump; | ||
| 1767 | |||
| 1768 | /* | ||
| 1769 | * Search the inode node this entry refers to and the parent | ||
| 1770 | * inode node and insert them to the RB-tree of inodes. | ||
| 1771 | */ | ||
| 1772 | inum = le64_to_cpu(dent->inum); | ||
| 1773 | fscki = read_add_inode(c, priv, inum); | ||
| 1774 | if (IS_ERR(fscki)) { | ||
| 1775 | err = PTR_ERR(fscki); | ||
| 1776 | ubifs_err("error %d while processing entry node and " | ||
| 1777 | "trying to find inode node %lu", err, inum); | ||
| 1778 | goto out_dump; | ||
| 1779 | } | ||
| 1780 | |||
| 1781 | /* Count how many direntries or xentries refers this inode */ | ||
| 1782 | fscki->references += 1; | ||
| 1783 | |||
| 1784 | inum = key_inum_flash(c, &dent->key); | ||
| 1785 | fscki1 = read_add_inode(c, priv, inum); | ||
| 1786 | if (IS_ERR(fscki1)) { | ||
| 1787 | err = PTR_ERR(fscki); | ||
| 1788 | ubifs_err("error %d while processing entry node and " | ||
| 1789 | "trying to find parent inode node %lu", | ||
| 1790 | err, inum); | ||
| 1791 | goto out_dump; | ||
| 1792 | } | ||
| 1793 | |||
| 1794 | nlen = le16_to_cpu(dent->nlen); | ||
| 1795 | if (type == UBIFS_XENT_KEY) { | ||
| 1796 | fscki1->calc_xcnt += 1; | ||
| 1797 | fscki1->calc_xsz += CALC_DENT_SIZE(nlen); | ||
| 1798 | fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size); | ||
| 1799 | fscki1->calc_xnms += nlen; | ||
| 1800 | } else { | ||
| 1801 | fscki1->calc_sz += CALC_DENT_SIZE(nlen); | ||
| 1802 | if (dent->type == UBIFS_ITYPE_DIR) | ||
| 1803 | fscki1->calc_cnt += 1; | ||
| 1804 | } | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | out: | ||
| 1808 | kfree(node); | ||
| 1809 | return 0; | ||
| 1810 | |||
| 1811 | out_dump: | ||
| 1812 | ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); | ||
| 1813 | dbg_dump_node(c, node); | ||
| 1814 | out_free: | ||
| 1815 | kfree(node); | ||
| 1816 | return err; | ||
| 1817 | } | ||
| 1818 | |||
| 1819 | /** | ||
| 1820 | * free_inodes - free RB-tree of inodes. | ||
| 1821 | * @fsckd: FS checking information | ||
| 1822 | */ | ||
| 1823 | static void free_inodes(struct fsck_data *fsckd) | ||
| 1824 | { | ||
| 1825 | struct rb_node *this = fsckd->inodes.rb_node; | ||
| 1826 | struct fsck_inode *fscki; | ||
| 1827 | |||
| 1828 | while (this) { | ||
| 1829 | if (this->rb_left) | ||
| 1830 | this = this->rb_left; | ||
| 1831 | else if (this->rb_right) | ||
| 1832 | this = this->rb_right; | ||
| 1833 | else { | ||
| 1834 | fscki = rb_entry(this, struct fsck_inode, rb); | ||
| 1835 | this = rb_parent(this); | ||
| 1836 | if (this) { | ||
| 1837 | if (this->rb_left == &fscki->rb) | ||
| 1838 | this->rb_left = NULL; | ||
| 1839 | else | ||
| 1840 | this->rb_right = NULL; | ||
| 1841 | } | ||
| 1842 | kfree(fscki); | ||
| 1843 | } | ||
| 1844 | } | ||
| 1845 | } | ||
| 1846 | |||
| 1847 | /** | ||
| 1848 | * check_inodes - checks all inodes. | ||
| 1849 | * @c: UBIFS file-system description object | ||
| 1850 | * @fsckd: FS checking information | ||
| 1851 | * | ||
| 1852 | * This is a helper function for 'dbg_check_filesystem()' which walks the | ||
| 1853 | * RB-tree of inodes after the index scan has been finished, and checks that | ||
| 1854 | * inode nlink, size, etc are correct. Returns zero if inodes are fine, | ||
| 1855 | * %-EINVAL if not, and a negative error code in case of failure. | ||
| 1856 | */ | ||
| 1857 | static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) | ||
| 1858 | { | ||
| 1859 | int n, err; | ||
| 1860 | union ubifs_key key; | ||
| 1861 | struct ubifs_znode *znode; | ||
| 1862 | struct ubifs_zbranch *zbr; | ||
| 1863 | struct ubifs_ino_node *ino; | ||
| 1864 | struct fsck_inode *fscki; | ||
| 1865 | struct rb_node *this = rb_first(&fsckd->inodes); | ||
| 1866 | |||
| 1867 | while (this) { | ||
| 1868 | fscki = rb_entry(this, struct fsck_inode, rb); | ||
| 1869 | this = rb_next(this); | ||
| 1870 | |||
| 1871 | if (S_ISDIR(fscki->mode)) { | ||
| 1872 | /* | ||
| 1873 | * Directories have to have exactly one reference (they | ||
| 1874 | * cannot have hardlinks), although root inode is an | ||
| 1875 | * exception. | ||
| 1876 | */ | ||
| 1877 | if (fscki->inum != UBIFS_ROOT_INO && | ||
| 1878 | fscki->references != 1) { | ||
| 1879 | ubifs_err("directory inode %lu has %d " | ||
| 1880 | "direntries which refer it, but " | ||
| 1881 | "should be 1", fscki->inum, | ||
| 1882 | fscki->references); | ||
| 1883 | goto out_dump; | ||
| 1884 | } | ||
| 1885 | if (fscki->inum == UBIFS_ROOT_INO && | ||
| 1886 | fscki->references != 0) { | ||
| 1887 | ubifs_err("root inode %lu has non-zero (%d) " | ||
| 1888 | "direntries which refer it", | ||
| 1889 | fscki->inum, fscki->references); | ||
| 1890 | goto out_dump; | ||
| 1891 | } | ||
| 1892 | if (fscki->calc_sz != fscki->size) { | ||
| 1893 | ubifs_err("directory inode %lu size is %lld, " | ||
| 1894 | "but calculated size is %lld", | ||
| 1895 | fscki->inum, fscki->size, | ||
| 1896 | fscki->calc_sz); | ||
| 1897 | goto out_dump; | ||
| 1898 | } | ||
| 1899 | if (fscki->calc_cnt != fscki->nlink) { | ||
| 1900 | ubifs_err("directory inode %lu nlink is %d, " | ||
| 1901 | "but calculated nlink is %d", | ||
| 1902 | fscki->inum, fscki->nlink, | ||
| 1903 | fscki->calc_cnt); | ||
| 1904 | goto out_dump; | ||
| 1905 | } | ||
| 1906 | } else { | ||
| 1907 | if (fscki->references != fscki->nlink) { | ||
| 1908 | ubifs_err("inode %lu nlink is %d, but " | ||
| 1909 | "calculated nlink is %d", fscki->inum, | ||
| 1910 | fscki->nlink, fscki->references); | ||
| 1911 | goto out_dump; | ||
| 1912 | } | ||
| 1913 | } | ||
| 1914 | if (fscki->xattr_sz != fscki->calc_xsz) { | ||
| 1915 | ubifs_err("inode %lu has xattr size %u, but " | ||
| 1916 | "calculated size is %lld", | ||
| 1917 | fscki->inum, fscki->xattr_sz, | ||
| 1918 | fscki->calc_xsz); | ||
| 1919 | goto out_dump; | ||
| 1920 | } | ||
| 1921 | if (fscki->xattr_cnt != fscki->calc_xcnt) { | ||
| 1922 | ubifs_err("inode %lu has %u xattrs, but " | ||
| 1923 | "calculated count is %lld", fscki->inum, | ||
| 1924 | fscki->xattr_cnt, fscki->calc_xcnt); | ||
| 1925 | goto out_dump; | ||
| 1926 | } | ||
| 1927 | if (fscki->xattr_nms != fscki->calc_xnms) { | ||
| 1928 | ubifs_err("inode %lu has xattr names' size %u, but " | ||
| 1929 | "calculated names' size is %lld", | ||
| 1930 | fscki->inum, fscki->xattr_nms, | ||
| 1931 | fscki->calc_xnms); | ||
| 1932 | goto out_dump; | ||
| 1933 | } | ||
| 1934 | } | ||
| 1935 | |||
| 1936 | return 0; | ||
| 1937 | |||
| 1938 | out_dump: | ||
| 1939 | /* Read the bad inode and dump it */ | ||
| 1940 | ino_key_init(c, &key, fscki->inum); | ||
| 1941 | err = ubifs_lookup_level0(c, &key, &znode, &n); | ||
| 1942 | if (!err) { | ||
| 1943 | ubifs_err("inode %lu not found in index", fscki->inum); | ||
| 1944 | return -ENOENT; | ||
| 1945 | } else if (err < 0) { | ||
| 1946 | ubifs_err("error %d while looking up inode %lu", | ||
| 1947 | err, fscki->inum); | ||
| 1948 | return err; | ||
| 1949 | } | ||
| 1950 | |||
| 1951 | zbr = &znode->zbranch[n]; | ||
| 1952 | ino = kmalloc(zbr->len, GFP_NOFS); | ||
| 1953 | if (!ino) | ||
| 1954 | return -ENOMEM; | ||
| 1955 | |||
| 1956 | err = ubifs_tnc_read_node(c, zbr, ino); | ||
| 1957 | if (err) { | ||
| 1958 | ubifs_err("cannot read inode node at LEB %d:%d, error %d", | ||
| 1959 | zbr->lnum, zbr->offs, err); | ||
| 1960 | kfree(ino); | ||
| 1961 | return err; | ||
| 1962 | } | ||
| 1963 | |||
| 1964 | ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", | ||
| 1965 | fscki->inum, zbr->lnum, zbr->offs); | ||
| 1966 | dbg_dump_node(c, ino); | ||
| 1967 | kfree(ino); | ||
| 1968 | return -EINVAL; | ||
| 1969 | } | ||
| 1970 | |||
| 1971 | /** | ||
| 1972 | * dbg_check_filesystem - check the file-system. | ||
| 1973 | * @c: UBIFS file-system description object | ||
| 1974 | * | ||
| 1975 | * This function checks the file system, namely: | ||
| 1976 | * o makes sure that all leaf nodes exist and their CRCs are correct; | ||
| 1977 | * o makes sure inode nlink, size, xattr size/count are correct (for all | ||
| 1978 | * inodes). | ||
| 1979 | * | ||
| 1980 | * The function reads whole indexing tree and all nodes, so it is pretty | ||
| 1981 | * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if | ||
| 1982 | * not, and a negative error code in case of failure. | ||
| 1983 | */ | ||
| 1984 | int dbg_check_filesystem(struct ubifs_info *c) | ||
| 1985 | { | ||
| 1986 | int err; | ||
| 1987 | struct fsck_data fsckd; | ||
| 1988 | |||
| 1989 | if (!(ubifs_chk_flags & UBIFS_CHK_FS)) | ||
| 1990 | return 0; | ||
| 1991 | |||
| 1992 | fsckd.inodes = RB_ROOT; | ||
| 1993 | err = dbg_walk_index(c, check_leaf, NULL, &fsckd); | ||
| 1994 | if (err) | ||
| 1995 | goto out_free; | ||
| 1996 | |||
| 1997 | err = check_inodes(c, &fsckd); | ||
| 1998 | if (err) | ||
| 1999 | goto out_free; | ||
| 2000 | |||
| 2001 | free_inodes(&fsckd); | ||
| 2002 | return 0; | ||
| 2003 | |||
| 2004 | out_free: | ||
| 2005 | ubifs_err("file-system check failed with error %d", err); | ||
| 2006 | dump_stack(); | ||
| 2007 | free_inodes(&fsckd); | ||
| 2008 | return err; | ||
| 2009 | } | ||
| 2010 | |||
| 2011 | static int invocation_cnt; | ||
| 2012 | |||
| 2013 | int dbg_force_in_the_gaps(void) | ||
| 2014 | { | ||
| 2015 | if (!dbg_force_in_the_gaps_enabled) | ||
| 2016 | return 0; | ||
| 2017 | /* Force in-the-gaps every 8th commit */ | ||
| 2018 | return !((invocation_cnt++) & 0x7); | ||
| 2019 | } | ||
| 2020 | |||
/* Failure mode for recovery testing */

/* Evaluate to true with probability n/d, using simple_rand() below */
#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))

/*
 * One entry per file-system registered for failure testing; maps a UBI
 * volume descriptor back to its 'struct ubifs_info'.
 */
struct failure_mode_info {
	struct list_head list;
	struct ubifs_info *c;
};

/* List of registered file-systems, protected by 'fmi_lock' */
static LIST_HEAD(fmi_list);
static DEFINE_SPINLOCK(fmi_lock);

/* State of the pseudo-random number generator used by simple_rand() */
static unsigned int next;
| 2035 | static int simple_rand(void) | ||
| 2036 | { | ||
| 2037 | if (next == 0) | ||
| 2038 | next = current->pid; | ||
| 2039 | next = next * 1103515245 + 12345; | ||
| 2040 | return (next >> 16) & 32767; | ||
| 2041 | } | ||
| 2042 | |||
| 2043 | void dbg_failure_mode_registration(struct ubifs_info *c) | ||
| 2044 | { | ||
| 2045 | struct failure_mode_info *fmi; | ||
| 2046 | |||
| 2047 | fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); | ||
| 2048 | if (!fmi) { | ||
| 2049 | dbg_err("Failed to register failure mode - no memory"); | ||
| 2050 | return; | ||
| 2051 | } | ||
| 2052 | fmi->c = c; | ||
| 2053 | spin_lock(&fmi_lock); | ||
| 2054 | list_add_tail(&fmi->list, &fmi_list); | ||
| 2055 | spin_unlock(&fmi_lock); | ||
| 2056 | } | ||
| 2057 | |||
| 2058 | void dbg_failure_mode_deregistration(struct ubifs_info *c) | ||
| 2059 | { | ||
| 2060 | struct failure_mode_info *fmi, *tmp; | ||
| 2061 | |||
| 2062 | spin_lock(&fmi_lock); | ||
| 2063 | list_for_each_entry_safe(fmi, tmp, &fmi_list, list) | ||
| 2064 | if (fmi->c == c) { | ||
| 2065 | list_del(&fmi->list); | ||
| 2066 | kfree(fmi); | ||
| 2067 | } | ||
| 2068 | spin_unlock(&fmi_lock); | ||
| 2069 | } | ||
| 2070 | |||
| 2071 | static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) | ||
| 2072 | { | ||
| 2073 | struct failure_mode_info *fmi; | ||
| 2074 | |||
| 2075 | spin_lock(&fmi_lock); | ||
| 2076 | list_for_each_entry(fmi, &fmi_list, list) | ||
| 2077 | if (fmi->c->ubi == desc) { | ||
| 2078 | struct ubifs_info *c = fmi->c; | ||
| 2079 | |||
| 2080 | spin_unlock(&fmi_lock); | ||
| 2081 | return c; | ||
| 2082 | } | ||
| 2083 | spin_unlock(&fmi_lock); | ||
| 2084 | return NULL; | ||
| 2085 | } | ||
| 2086 | |||
| 2087 | static int in_failure_mode(struct ubi_volume_desc *desc) | ||
| 2088 | { | ||
| 2089 | struct ubifs_info *c = dbg_find_info(desc); | ||
| 2090 | |||
| 2091 | if (c && dbg_failure_mode) | ||
| 2092 | return c->failure_mode; | ||
| 2093 | return 0; | ||
| 2094 | } | ||
| 2095 | |||
| 2096 | static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) | ||
| 2097 | { | ||
| 2098 | struct ubifs_info *c = dbg_find_info(desc); | ||
| 2099 | |||
| 2100 | if (!c || !dbg_failure_mode) | ||
| 2101 | return 0; | ||
| 2102 | if (c->failure_mode) | ||
| 2103 | return 1; | ||
| 2104 | if (!c->fail_cnt) { | ||
| 2105 | /* First call - decide delay to failure */ | ||
| 2106 | if (chance(1, 2)) { | ||
| 2107 | unsigned int delay = 1 << (simple_rand() >> 11); | ||
| 2108 | |||
| 2109 | if (chance(1, 2)) { | ||
| 2110 | c->fail_delay = 1; | ||
| 2111 | c->fail_timeout = jiffies + | ||
| 2112 | msecs_to_jiffies(delay); | ||
| 2113 | dbg_rcvry("failing after %ums", delay); | ||
| 2114 | } else { | ||
| 2115 | c->fail_delay = 2; | ||
| 2116 | c->fail_cnt_max = delay; | ||
| 2117 | dbg_rcvry("failing after %u calls", delay); | ||
| 2118 | } | ||
| 2119 | } | ||
| 2120 | c->fail_cnt += 1; | ||
| 2121 | } | ||
| 2122 | /* Determine if failure delay has expired */ | ||
| 2123 | if (c->fail_delay == 1) { | ||
| 2124 | if (time_before(jiffies, c->fail_timeout)) | ||
| 2125 | return 0; | ||
| 2126 | } else if (c->fail_delay == 2) | ||
| 2127 | if (c->fail_cnt++ < c->fail_cnt_max) | ||
| 2128 | return 0; | ||
| 2129 | if (lnum == UBIFS_SB_LNUM) { | ||
| 2130 | if (write) { | ||
| 2131 | if (chance(1, 2)) | ||
| 2132 | return 0; | ||
| 2133 | } else if (chance(19, 20)) | ||
| 2134 | return 0; | ||
| 2135 | dbg_rcvry("failing in super block LEB %d", lnum); | ||
| 2136 | } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { | ||
| 2137 | if (chance(19, 20)) | ||
| 2138 | return 0; | ||
| 2139 | dbg_rcvry("failing in master LEB %d", lnum); | ||
| 2140 | } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { | ||
| 2141 | if (write) { | ||
| 2142 | if (chance(99, 100)) | ||
| 2143 | return 0; | ||
| 2144 | } else if (chance(399, 400)) | ||
| 2145 | return 0; | ||
| 2146 | dbg_rcvry("failing in log LEB %d", lnum); | ||
| 2147 | } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { | ||
| 2148 | if (write) { | ||
| 2149 | if (chance(7, 8)) | ||
| 2150 | return 0; | ||
| 2151 | } else if (chance(19, 20)) | ||
| 2152 | return 0; | ||
| 2153 | dbg_rcvry("failing in LPT LEB %d", lnum); | ||
| 2154 | } else if (lnum >= c->orph_first && lnum <= c->orph_last) { | ||
| 2155 | if (write) { | ||
| 2156 | if (chance(1, 2)) | ||
| 2157 | return 0; | ||
| 2158 | } else if (chance(9, 10)) | ||
| 2159 | return 0; | ||
| 2160 | dbg_rcvry("failing in orphan LEB %d", lnum); | ||
| 2161 | } else if (lnum == c->ihead_lnum) { | ||
| 2162 | if (chance(99, 100)) | ||
| 2163 | return 0; | ||
| 2164 | dbg_rcvry("failing in index head LEB %d", lnum); | ||
| 2165 | } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { | ||
| 2166 | if (chance(9, 10)) | ||
| 2167 | return 0; | ||
| 2168 | dbg_rcvry("failing in GC head LEB %d", lnum); | ||
| 2169 | } else if (write && !RB_EMPTY_ROOT(&c->buds) && | ||
| 2170 | !ubifs_search_bud(c, lnum)) { | ||
| 2171 | if (chance(19, 20)) | ||
| 2172 | return 0; | ||
| 2173 | dbg_rcvry("failing in non-bud LEB %d", lnum); | ||
| 2174 | } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || | ||
| 2175 | c->cmt_state == COMMIT_RUNNING_REQUIRED) { | ||
| 2176 | if (chance(999, 1000)) | ||
| 2177 | return 0; | ||
| 2178 | dbg_rcvry("failing in bud LEB %d commit running", lnum); | ||
| 2179 | } else { | ||
| 2180 | if (chance(9999, 10000)) | ||
| 2181 | return 0; | ||
| 2182 | dbg_rcvry("failing in bud LEB %d commit not running", lnum); | ||
| 2183 | } | ||
| 2184 | ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); | ||
| 2185 | c->failure_mode = 1; | ||
| 2186 | dump_stack(); | ||
| 2187 | return 1; | ||
| 2188 | } | ||
| 2189 | |||
| 2190 | static void cut_data(const void *buf, int len) | ||
| 2191 | { | ||
| 2192 | int flen, i; | ||
| 2193 | unsigned char *p = (void *)buf; | ||
| 2194 | |||
| 2195 | flen = (len * (long long)simple_rand()) >> 15; | ||
| 2196 | for (i = flen; i < len; i++) | ||
| 2197 | p[i] = 0xff; | ||
| 2198 | } | ||
| 2199 | |||
| 2200 | int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | ||
| 2201 | int len, int check) | ||
| 2202 | { | ||
| 2203 | if (in_failure_mode(desc)) | ||
| 2204 | return -EIO; | ||
| 2205 | return ubi_leb_read(desc, lnum, buf, offset, len, check); | ||
| 2206 | } | ||
| 2207 | |||
| 2208 | int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
| 2209 | int offset, int len, int dtype) | ||
| 2210 | { | ||
| 2211 | int err; | ||
| 2212 | |||
| 2213 | if (in_failure_mode(desc)) | ||
| 2214 | return -EIO; | ||
| 2215 | if (do_fail(desc, lnum, 1)) | ||
| 2216 | cut_data(buf, len); | ||
| 2217 | err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); | ||
| 2218 | if (err) | ||
| 2219 | return err; | ||
| 2220 | if (in_failure_mode(desc)) | ||
| 2221 | return -EIO; | ||
| 2222 | return 0; | ||
| 2223 | } | ||
| 2224 | |||
| 2225 | int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
| 2226 | int len, int dtype) | ||
| 2227 | { | ||
| 2228 | int err; | ||
| 2229 | |||
| 2230 | if (do_fail(desc, lnum, 1)) | ||
| 2231 | return -EIO; | ||
| 2232 | err = ubi_leb_change(desc, lnum, buf, len, dtype); | ||
| 2233 | if (err) | ||
| 2234 | return err; | ||
| 2235 | if (do_fail(desc, lnum, 1)) | ||
| 2236 | return -EIO; | ||
| 2237 | return 0; | ||
| 2238 | } | ||
| 2239 | |||
| 2240 | int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) | ||
| 2241 | { | ||
| 2242 | int err; | ||
| 2243 | |||
| 2244 | if (do_fail(desc, lnum, 0)) | ||
| 2245 | return -EIO; | ||
| 2246 | err = ubi_leb_erase(desc, lnum); | ||
| 2247 | if (err) | ||
| 2248 | return err; | ||
| 2249 | if (do_fail(desc, lnum, 0)) | ||
| 2250 | return -EIO; | ||
| 2251 | return 0; | ||
| 2252 | } | ||
| 2253 | |||
| 2254 | int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) | ||
| 2255 | { | ||
| 2256 | int err; | ||
| 2257 | |||
| 2258 | if (do_fail(desc, lnum, 0)) | ||
| 2259 | return -EIO; | ||
| 2260 | err = ubi_leb_unmap(desc, lnum); | ||
| 2261 | if (err) | ||
| 2262 | return err; | ||
| 2263 | if (do_fail(desc, lnum, 0)) | ||
| 2264 | return -EIO; | ||
| 2265 | return 0; | ||
| 2266 | } | ||
| 2267 | |||
| 2268 | int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) | ||
| 2269 | { | ||
| 2270 | if (in_failure_mode(desc)) | ||
| 2271 | return -EIO; | ||
| 2272 | return ubi_is_mapped(desc, lnum); | ||
| 2273 | } | ||
| 2274 | |||
| 2275 | int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) | ||
| 2276 | { | ||
| 2277 | int err; | ||
| 2278 | |||
| 2279 | if (do_fail(desc, lnum, 0)) | ||
| 2280 | return -EIO; | ||
| 2281 | err = ubi_leb_map(desc, lnum, dtype); | ||
| 2282 | if (err) | ||
| 2283 | return err; | ||
| 2284 | if (do_fail(desc, lnum, 0)) | ||
| 2285 | return -EIO; | ||
| 2286 | return 0; | ||
| 2287 | } | ||
| 2288 | |||
| 2289 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h new file mode 100644 index 000000000000..3c4f1e93c9e0 --- /dev/null +++ b/fs/ubifs/debug.h | |||
| @@ -0,0 +1,403 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | #ifndef __UBIFS_DEBUG_H__ | ||
| 24 | #define __UBIFS_DEBUG_H__ | ||
| 25 | |||
| 26 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 27 | |||
| 28 | #define UBIFS_DBG(op) op | ||
| 29 | |||
| 30 | #define ubifs_assert(expr) do { \ | ||
| 31 | if (unlikely(!(expr))) { \ | ||
| 32 | printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ | ||
| 33 | __func__, __LINE__, current->pid); \ | ||
| 34 | dbg_dump_stack(); \ | ||
| 35 | } \ | ||
| 36 | } while (0) | ||
| 37 | |||
| 38 | #define ubifs_assert_cmt_locked(c) do { \ | ||
| 39 | if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ | ||
| 40 | up_write(&(c)->commit_sem); \ | ||
| 41 | printk(KERN_CRIT "commit lock is not locked!\n"); \ | ||
| 42 | ubifs_assert(0); \ | ||
| 43 | } \ | ||
| 44 | } while (0) | ||
| 45 | |||
| 46 | #define dbg_dump_stack() do { \ | ||
| 47 | if (!dbg_failure_mode) \ | ||
| 48 | dump_stack(); \ | ||
| 49 | } while (0) | ||
| 50 | |||
| 51 | /* Generic debugging messages */ | ||
| 52 | #define dbg_msg(fmt, ...) do { \ | ||
| 53 | spin_lock(&dbg_lock); \ | ||
| 54 | printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ | ||
| 55 | __func__, ##__VA_ARGS__); \ | ||
| 56 | spin_unlock(&dbg_lock); \ | ||
| 57 | } while (0) | ||
| 58 | |||
| 59 | #define dbg_do_msg(typ, fmt, ...) do { \ | ||
| 60 | if (ubifs_msg_flags & typ) \ | ||
| 61 | dbg_msg(fmt, ##__VA_ARGS__); \ | ||
| 62 | } while (0) | ||
| 63 | |||
| 64 | #define dbg_err(fmt, ...) do { \ | ||
| 65 | spin_lock(&dbg_lock); \ | ||
| 66 | ubifs_err(fmt, ##__VA_ARGS__); \ | ||
| 67 | spin_unlock(&dbg_lock); \ | ||
| 68 | } while (0) | ||
| 69 | |||
| 70 | const char *dbg_key_str0(const struct ubifs_info *c, | ||
| 71 | const union ubifs_key *key); | ||
| 72 | const char *dbg_key_str1(const struct ubifs_info *c, | ||
| 73 | const union ubifs_key *key); | ||
| 74 | |||
| 75 | /* | ||
| 76 | * DBGKEY macros require dbg_lock to be held, which it is in the dbg message | ||
| 77 | * macros. | ||
| 78 | */ | ||
| 79 | #define DBGKEY(key) dbg_key_str0(c, (key)) | ||
| 80 | #define DBGKEY1(key) dbg_key_str1(c, (key)) | ||
| 81 | |||
| 82 | /* General messages */ | ||
| 83 | #define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) | ||
| 84 | |||
| 85 | /* Additional journal messages */ | ||
| 86 | #define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) | ||
| 87 | |||
| 88 | /* Additional TNC messages */ | ||
| 89 | #define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) | ||
| 90 | |||
| 91 | /* Additional lprops messages */ | ||
| 92 | #define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) | ||
| 93 | |||
| 94 | /* Additional LEB find messages */ | ||
| 95 | #define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) | ||
| 96 | |||
| 97 | /* Additional mount messages */ | ||
| 98 | #define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) | ||
| 99 | |||
| 100 | /* Additional I/O messages */ | ||
| 101 | #define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) | ||
| 102 | |||
| 103 | /* Additional commit messages */ | ||
| 104 | #define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) | ||
| 105 | |||
| 106 | /* Additional budgeting messages */ | ||
| 107 | #define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) | ||
| 108 | |||
| 109 | /* Additional log messages */ | ||
| 110 | #define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) | ||
| 111 | |||
| 112 | /* Additional gc messages */ | ||
| 113 | #define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) | ||
| 114 | |||
| 115 | /* Additional scan messages */ | ||
| 116 | #define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) | ||
| 117 | |||
| 118 | /* Additional recovery messages */ | ||
| 119 | #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Debugging message type flags (must match msg_type_names in debug.c). | ||
| 123 | * | ||
| 124 | * UBIFS_MSG_GEN: general messages | ||
| 125 | * UBIFS_MSG_JNL: journal messages | ||
| 126 | * UBIFS_MSG_MNT: mount messages | ||
| 127 | * UBIFS_MSG_CMT: commit messages | ||
| 128 | * UBIFS_MSG_FIND: LEB find messages | ||
| 129 | * UBIFS_MSG_BUDG: budgeting messages | ||
| 130 | * UBIFS_MSG_GC: garbage collection messages | ||
| 131 | * UBIFS_MSG_TNC: TNC messages | ||
| 132 | * UBIFS_MSG_LP: lprops messages | ||
| 133 | * UBIFS_MSG_IO: I/O messages | ||
| 134 | * UBIFS_MSG_LOG: log messages | ||
| 135 | * UBIFS_MSG_SCAN: scan messages | ||
| 136 | * UBIFS_MSG_RCVRY: recovery messages | ||
| 137 | */ | ||
| 138 | enum { | ||
| 139 | UBIFS_MSG_GEN = 0x1, | ||
| 140 | UBIFS_MSG_JNL = 0x2, | ||
| 141 | UBIFS_MSG_MNT = 0x4, | ||
| 142 | UBIFS_MSG_CMT = 0x8, | ||
| 143 | UBIFS_MSG_FIND = 0x10, | ||
| 144 | UBIFS_MSG_BUDG = 0x20, | ||
| 145 | UBIFS_MSG_GC = 0x40, | ||
| 146 | UBIFS_MSG_TNC = 0x80, | ||
| 147 | UBIFS_MSG_LP = 0x100, | ||
| 148 | UBIFS_MSG_IO = 0x200, | ||
| 149 | UBIFS_MSG_LOG = 0x400, | ||
| 150 | UBIFS_MSG_SCAN = 0x800, | ||
| 151 | UBIFS_MSG_RCVRY = 0x1000, | ||
| 152 | }; | ||
| 153 | |||
| 154 | /* Debugging message type flags for each default debug message level */ | ||
| 155 | #define UBIFS_MSG_LVL_0 0 | ||
| 156 | #define UBIFS_MSG_LVL_1 0x1 | ||
| 157 | #define UBIFS_MSG_LVL_2 0x7f | ||
| 158 | #define UBIFS_MSG_LVL_3 0xffff | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Debugging check flags (must match chk_names in debug.c). | ||
| 162 | * | ||
| 163 | * UBIFS_CHK_GEN: general checks | ||
| 164 | * UBIFS_CHK_TNC: check TNC | ||
| 165 | * UBIFS_CHK_IDX_SZ: check index size | ||
| 166 | * UBIFS_CHK_ORPH: check orphans | ||
| 167 | * UBIFS_CHK_OLD_IDX: check the old index | ||
| 168 | * UBIFS_CHK_LPROPS: check lprops | ||
| 169 | * UBIFS_CHK_FS: check the file-system | ||
| 170 | */ | ||
| 171 | enum { | ||
| 172 | UBIFS_CHK_GEN = 0x1, | ||
| 173 | UBIFS_CHK_TNC = 0x2, | ||
| 174 | UBIFS_CHK_IDX_SZ = 0x4, | ||
| 175 | UBIFS_CHK_ORPH = 0x8, | ||
| 176 | UBIFS_CHK_OLD_IDX = 0x10, | ||
| 177 | UBIFS_CHK_LPROPS = 0x20, | ||
| 178 | UBIFS_CHK_FS = 0x40, | ||
| 179 | }; | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Special testing flags (must match tst_names in debug.c). | ||
| 183 | * | ||
| 184 | * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method | ||
| 185 | * UBIFS_TST_RCVRY: failure mode for recovery testing | ||
| 186 | */ | ||
| 187 | enum { | ||
| 188 | UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, | ||
| 189 | UBIFS_TST_RCVRY = 0x4, | ||
| 190 | }; | ||
| 191 | |||
| 192 | #if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 | ||
| 193 | #define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 | ||
| 194 | #elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 | ||
| 195 | #define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 | ||
| 196 | #elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 | ||
| 197 | #define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 | ||
| 198 | #else | ||
| 199 | #define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 | ||
| 200 | #endif | ||
| 201 | |||
| 202 | #ifdef CONFIG_UBIFS_FS_DEBUG_CHKS | ||
| 203 | #define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff | ||
| 204 | #else | ||
| 205 | #define UBIFS_CHK_FLAGS_DEFAULT 0 | ||
| 206 | #endif | ||
| 207 | |||
| 208 | extern spinlock_t dbg_lock; | ||
| 209 | |||
| 210 | extern unsigned int ubifs_msg_flags; | ||
| 211 | extern unsigned int ubifs_chk_flags; | ||
| 212 | extern unsigned int ubifs_tst_flags; | ||
| 213 | |||
| 214 | /* Dump functions */ | ||
| 215 | |||
| 216 | const char *dbg_ntype(int type); | ||
| 217 | const char *dbg_cstate(int cmt_state); | ||
| 218 | const char *dbg_get_key_dump(const struct ubifs_info *c, | ||
| 219 | const union ubifs_key *key); | ||
| 220 | void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); | ||
| 221 | void dbg_dump_node(const struct ubifs_info *c, const void *node); | ||
| 222 | void dbg_dump_budget_req(const struct ubifs_budget_req *req); | ||
| 223 | void dbg_dump_lstats(const struct ubifs_lp_stats *lst); | ||
| 224 | void dbg_dump_budg(struct ubifs_info *c); | ||
| 225 | void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); | ||
| 226 | void dbg_dump_lprops(struct ubifs_info *c); | ||
| 227 | void dbg_dump_leb(const struct ubifs_info *c, int lnum); | ||
| 228 | void dbg_dump_znode(const struct ubifs_info *c, | ||
| 229 | const struct ubifs_znode *znode); | ||
| 230 | void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); | ||
| 231 | void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, | ||
| 232 | struct ubifs_nnode *parent, int iip); | ||
| 233 | void dbg_dump_tnc(struct ubifs_info *c); | ||
| 234 | void dbg_dump_index(struct ubifs_info *c); | ||
| 235 | |||
| 236 | /* Checking helper functions */ | ||
| 237 | |||
| 238 | typedef int (*dbg_leaf_callback)(struct ubifs_info *c, | ||
| 239 | struct ubifs_zbranch *zbr, void *priv); | ||
| 240 | typedef int (*dbg_znode_callback)(struct ubifs_info *c, | ||
| 241 | struct ubifs_znode *znode, void *priv); | ||
| 242 | |||
| 243 | int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, | ||
| 244 | dbg_znode_callback znode_cb, void *priv); | ||
| 245 | |||
| 246 | /* Checking functions */ | ||
| 247 | |||
| 248 | int dbg_check_lprops(struct ubifs_info *c); | ||
| 249 | |||
| 250 | int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); | ||
| 251 | int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); | ||
| 252 | |||
| 253 | int dbg_check_cats(struct ubifs_info *c); | ||
| 254 | |||
| 255 | int dbg_check_ltab(struct ubifs_info *c); | ||
| 256 | |||
| 257 | int dbg_check_synced_i_size(struct inode *inode); | ||
| 258 | |||
| 259 | int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); | ||
| 260 | |||
| 261 | int dbg_check_tnc(struct ubifs_info *c, int extra); | ||
| 262 | |||
| 263 | int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); | ||
| 264 | |||
| 265 | int dbg_check_filesystem(struct ubifs_info *c); | ||
| 266 | |||
| 267 | void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, | ||
| 268 | int add_pos); | ||
| 269 | |||
| 270 | int dbg_check_lprops(struct ubifs_info *c); | ||
| 271 | int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, | ||
| 272 | int row, int col); | ||
| 273 | |||
| 274 | /* Force the use of in-the-gaps method for testing */ | ||
| 275 | |||
| 276 | #define dbg_force_in_the_gaps_enabled \ | ||
| 277 | (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) | ||
| 278 | |||
| 279 | int dbg_force_in_the_gaps(void); | ||
| 280 | |||
| 281 | /* Failure mode for recovery testing */ | ||
| 282 | |||
| 283 | #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) | ||
| 284 | |||
| 285 | void dbg_failure_mode_registration(struct ubifs_info *c); | ||
| 286 | void dbg_failure_mode_deregistration(struct ubifs_info *c); | ||
| 287 | |||
| 288 | #ifndef UBIFS_DBG_PRESERVE_UBI | ||
| 289 | |||
| 290 | #define ubi_leb_read dbg_leb_read | ||
| 291 | #define ubi_leb_write dbg_leb_write | ||
| 292 | #define ubi_leb_change dbg_leb_change | ||
| 293 | #define ubi_leb_erase dbg_leb_erase | ||
| 294 | #define ubi_leb_unmap dbg_leb_unmap | ||
| 295 | #define ubi_is_mapped dbg_is_mapped | ||
| 296 | #define ubi_leb_map dbg_leb_map | ||
| 297 | |||
| 298 | #endif | ||
| 299 | |||
| 300 | int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | ||
| 301 | int len, int check); | ||
| 302 | int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
| 303 | int offset, int len, int dtype); | ||
| 304 | int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, | ||
| 305 | int len, int dtype); | ||
| 306 | int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); | ||
| 307 | int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); | ||
| 308 | int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); | ||
| 309 | int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); | ||
| 310 | |||
| 311 | static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, | ||
| 312 | int offset, int len) | ||
| 313 | { | ||
| 314 | return dbg_leb_read(desc, lnum, buf, offset, len, 0); | ||
| 315 | } | ||
| 316 | |||
| 317 | static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, | ||
| 318 | const void *buf, int offset, int len) | ||
| 319 | { | ||
| 320 | return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); | ||
| 321 | } | ||
| 322 | |||
| 323 | static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, | ||
| 324 | const void *buf, int len) | ||
| 325 | { | ||
| 326 | return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); | ||
| 327 | } | ||
| 328 | |||
| 329 | #else /* !CONFIG_UBIFS_FS_DEBUG */ | ||
| 330 | |||
| 331 | #define UBIFS_DBG(op) | ||
| 332 | #define ubifs_assert(expr) ({}) | ||
| 333 | #define ubifs_assert_cmt_locked(c) | ||
| 334 | #define dbg_dump_stack() | ||
| 335 | #define dbg_err(fmt, ...) ({}) | ||
| 336 | #define dbg_msg(fmt, ...) ({}) | ||
| 337 | #define dbg_key(c, key, fmt, ...) ({}) | ||
| 338 | |||
| 339 | #define dbg_gen(fmt, ...) ({}) | ||
| 340 | #define dbg_jnl(fmt, ...) ({}) | ||
| 341 | #define dbg_tnc(fmt, ...) ({}) | ||
| 342 | #define dbg_lp(fmt, ...) ({}) | ||
| 343 | #define dbg_find(fmt, ...) ({}) | ||
| 344 | #define dbg_mnt(fmt, ...) ({}) | ||
| 345 | #define dbg_io(fmt, ...) ({}) | ||
| 346 | #define dbg_cmt(fmt, ...) ({}) | ||
| 347 | #define dbg_budg(fmt, ...) ({}) | ||
| 348 | #define dbg_log(fmt, ...) ({}) | ||
| 349 | #define dbg_gc(fmt, ...) ({}) | ||
| 350 | #define dbg_scan(fmt, ...) ({}) | ||
| 351 | #define dbg_rcvry(fmt, ...) ({}) | ||
| 352 | |||
| 353 | #define dbg_ntype(type) "" | ||
| 354 | #define dbg_cstate(cmt_state) "" | ||
| 355 | #define dbg_get_key_dump(c, key) ({}) | ||
| 356 | #define dbg_dump_inode(c, inode) ({}) | ||
| 357 | #define dbg_dump_node(c, node) ({}) | ||
| 358 | #define dbg_dump_budget_req(req) ({}) | ||
| 359 | #define dbg_dump_lstats(lst) ({}) | ||
| 360 | #define dbg_dump_budg(c) ({}) | ||
| 361 | #define dbg_dump_lprop(c, lp) ({}) | ||
| 362 | #define dbg_dump_lprops(c) ({}) | ||
| 363 | #define dbg_dump_leb(c, lnum) ({}) | ||
| 364 | #define dbg_dump_znode(c, znode) ({}) | ||
| 365 | #define dbg_dump_heap(c, heap, cat) ({}) | ||
| 366 | #define dbg_dump_pnode(c, pnode, parent, iip) ({}) | ||
| 367 | #define dbg_dump_tnc(c) ({}) | ||
| 368 | #define dbg_dump_index(c) ({}) | ||
| 369 | |||
| 370 | #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 | ||
| 371 | |||
| 372 | #define dbg_old_index_check_init(c, zroot) 0 | ||
| 373 | #define dbg_check_old_index(c, zroot) 0 | ||
| 374 | |||
| 375 | #define dbg_check_cats(c) 0 | ||
| 376 | |||
| 377 | #define dbg_check_ltab(c) 0 | ||
| 378 | |||
| 379 | #define dbg_check_synced_i_size(inode) 0 | ||
| 380 | |||
| 381 | #define dbg_check_dir_size(c, dir) 0 | ||
| 382 | |||
| 383 | #define dbg_check_tnc(c, x) 0 | ||
| 384 | |||
| 385 | #define dbg_check_idx_size(c, idx_size) 0 | ||
| 386 | |||
| 387 | #define dbg_check_filesystem(c) 0 | ||
| 388 | |||
| 389 | #define dbg_check_heap(c, heap, cat, add_pos) ({}) | ||
| 390 | |||
| 391 | #define dbg_check_lprops(c) 0 | ||
| 392 | #define dbg_check_lpt_nodes(c, cnode, row, col) 0 | ||
| 393 | |||
| 394 | #define dbg_force_in_the_gaps_enabled 0 | ||
| 395 | #define dbg_force_in_the_gaps() 0 | ||
| 396 | |||
| 397 | #define dbg_failure_mode 0 | ||
| 398 | #define dbg_failure_mode_registration(c) ({}) | ||
| 399 | #define dbg_failure_mode_deregistration(c) ({}) | ||
| 400 | |||
| 401 | #endif /* !CONFIG_UBIFS_FS_DEBUG */ | ||
| 402 | |||
| 403 | #endif /* !__UBIFS_DEBUG_H__ */ | ||
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c new file mode 100644 index 000000000000..e90374be7d3b --- /dev/null +++ b/fs/ubifs/dir.c | |||
| @@ -0,0 +1,1240 @@ | |||
| 1 | /* * This file is part of UBIFS. | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 4 | * Copyright (C) 2006, 2007 University of Szeged, Hungary | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | * Zoltan Sogor | ||
| 22 | */ | ||
| 23 | |||
| 24 | /* | ||
| 25 | * This file implements directory operations. | ||
| 26 | * | ||
| 27 | * All FS operations in this file allocate budget before writing anything to the | ||
| 28 | * media. If they fail to allocate it, the error is returned. The only | ||
| 29 | * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even | ||
| 30 | * if they unable to allocate the budget, because deletion %-ENOSPC failure is | ||
| 31 | * not what users are usually ready to get. UBIFS budgeting subsystem has some | ||
| 32 | * space reserved for these purposes. | ||
| 33 | * | ||
| 34 | * All operations in this file write all inodes which they change straight | ||
| 35 | * away, instead of marking them dirty. For example, 'ubifs_link()' changes | ||
| 36 | * @i_size of the parent inode and writes the parent inode together with the | ||
| 37 | * target inode. This was done to simplify file-system recovery which would | ||
| 38 | * otherwise be very difficult to do. The only exception is rename which marks | ||
| 39 | * the re-named inode dirty (because its @i_ctime is updated) but does not | ||
| 40 | * write it, but just marks it as dirty. | ||
| 41 | */ | ||
| 42 | |||
| 43 | #include "ubifs.h" | ||
| 44 | |||
| 45 | /** | ||
| 46 | * inherit_flags - inherit flags of the parent inode. | ||
| 47 | * @dir: parent inode | ||
| 48 | * @mode: new inode mode flags | ||
| 49 | * | ||
| 50 | * This is a helper function for 'ubifs_new_inode()' which inherits flag of the | ||
| 51 | * parent directory inode @dir. UBIFS inodes inherit the following flags: | ||
| 52 | * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on | ||
| 53 | * sub-directory basis; | ||
| 54 | * o %UBIFS_SYNC_FL - useful for the same reasons; | ||
| 55 | * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. | ||
| 56 | * | ||
| 57 | * This function returns the inherited flags. | ||
| 58 | */ | ||
| 59 | static int inherit_flags(const struct inode *dir, int mode) | ||
| 60 | { | ||
| 61 | int flags; | ||
| 62 | const struct ubifs_inode *ui = ubifs_inode(dir); | ||
| 63 | |||
| 64 | if (!S_ISDIR(dir->i_mode)) | ||
| 65 | /* | ||
| 66 | * The parent is not a directory, which means that an extended | ||
| 67 | * attribute inode is being created. No flags. | ||
| 68 | */ | ||
| 69 | return 0; | ||
| 70 | |||
| 71 | flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); | ||
| 72 | if (!S_ISDIR(mode)) | ||
| 73 | /* The "DIRSYNC" flag only applies to directories */ | ||
| 74 | flags &= ~UBIFS_DIRSYNC_FL; | ||
| 75 | return flags; | ||
| 76 | } | ||
| 77 | |||
| 78 | /** | ||
| 79 | * ubifs_new_inode - allocate new UBIFS inode object. | ||
| 80 | * @c: UBIFS file-system description object | ||
| 81 | * @dir: parent directory inode | ||
| 82 | * @mode: inode mode flags | ||
| 83 | * | ||
| 84 | * This function finds an unused inode number, allocates new inode and | ||
| 85 | * initializes it. Returns new inode in case of success and an error code in | ||
| 86 | * case of failure. | ||
| 87 | */ | ||
| 88 | struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, | ||
| 89 | int mode) | ||
| 90 | { | ||
| 91 | struct inode *inode; | ||
| 92 | struct ubifs_inode *ui; | ||
| 93 | |||
| 94 | inode = new_inode(c->vfs_sb); | ||
| 95 | ui = ubifs_inode(inode); | ||
| 96 | if (!inode) | ||
| 97 | return ERR_PTR(-ENOMEM); | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and | ||
| 101 | * marking them dirty in file write path (see 'file_update_time()'). | ||
| 102 | * UBIFS has to fully control "clean <-> dirty" transitions of inodes | ||
| 103 | * to make budgeting work. | ||
| 104 | */ | ||
| 105 | inode->i_flags |= (S_NOCMTIME); | ||
| 106 | |||
| 107 | inode->i_uid = current->fsuid; | ||
| 108 | if (dir->i_mode & S_ISGID) { | ||
| 109 | inode->i_gid = dir->i_gid; | ||
| 110 | if (S_ISDIR(mode)) | ||
| 111 | mode |= S_ISGID; | ||
| 112 | } else | ||
| 113 | inode->i_gid = current->fsgid; | ||
| 114 | inode->i_mode = mode; | ||
| 115 | inode->i_mtime = inode->i_atime = inode->i_ctime = | ||
| 116 | ubifs_current_time(inode); | ||
| 117 | inode->i_mapping->nrpages = 0; | ||
| 118 | /* Disable readahead */ | ||
| 119 | inode->i_mapping->backing_dev_info = &c->bdi; | ||
| 120 | |||
| 121 | switch (mode & S_IFMT) { | ||
| 122 | case S_IFREG: | ||
| 123 | inode->i_mapping->a_ops = &ubifs_file_address_operations; | ||
| 124 | inode->i_op = &ubifs_file_inode_operations; | ||
| 125 | inode->i_fop = &ubifs_file_operations; | ||
| 126 | break; | ||
| 127 | case S_IFDIR: | ||
| 128 | inode->i_op = &ubifs_dir_inode_operations; | ||
| 129 | inode->i_fop = &ubifs_dir_operations; | ||
| 130 | inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; | ||
| 131 | break; | ||
| 132 | case S_IFLNK: | ||
| 133 | inode->i_op = &ubifs_symlink_inode_operations; | ||
| 134 | break; | ||
| 135 | case S_IFSOCK: | ||
| 136 | case S_IFIFO: | ||
| 137 | case S_IFBLK: | ||
| 138 | case S_IFCHR: | ||
| 139 | inode->i_op = &ubifs_file_inode_operations; | ||
| 140 | break; | ||
| 141 | default: | ||
| 142 | BUG(); | ||
| 143 | } | ||
| 144 | |||
| 145 | ui->flags = inherit_flags(dir, mode); | ||
| 146 | ubifs_set_inode_flags(inode); | ||
| 147 | if (S_ISREG(mode)) | ||
| 148 | ui->compr_type = c->default_compr; | ||
| 149 | else | ||
| 150 | ui->compr_type = UBIFS_COMPR_NONE; | ||
| 151 | ui->synced_i_size = 0; | ||
| 152 | |||
| 153 | spin_lock(&c->cnt_lock); | ||
| 154 | /* Inode number overflow is currently not supported */ | ||
| 155 | if (c->highest_inum >= INUM_WARN_WATERMARK) { | ||
| 156 | if (c->highest_inum >= INUM_WATERMARK) { | ||
| 157 | spin_unlock(&c->cnt_lock); | ||
| 158 | ubifs_err("out of inode numbers"); | ||
| 159 | make_bad_inode(inode); | ||
| 160 | iput(inode); | ||
| 161 | return ERR_PTR(-EINVAL); | ||
| 162 | } | ||
| 163 | ubifs_warn("running out of inode numbers (current %lu, max %d)", | ||
| 164 | c->highest_inum, INUM_WATERMARK); | ||
| 165 | } | ||
| 166 | |||
| 167 | inode->i_ino = ++c->highest_inum; | ||
| 168 | inode->i_generation = ++c->vfs_gen; | ||
| 169 | /* | ||
| 170 | * The creation sequence number remains with this inode for its | ||
| 171 | * lifetime. All nodes for this inode have a greater sequence number, | ||
| 172 | * and so it is possible to distinguish obsolete nodes belonging to a | ||
| 173 | * previous incarnation of the same inode number - for example, for the | ||
| 174 | * purpose of rebuilding the index. | ||
| 175 | */ | ||
| 176 | ui->creat_sqnum = ++c->max_sqnum; | ||
| 177 | spin_unlock(&c->cnt_lock); | ||
| 178 | return inode; | ||
| 179 | } | ||
| 180 | |||
| 181 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 182 | |||
| 183 | static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) | ||
| 184 | { | ||
| 185 | if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) | ||
| 186 | return 0; | ||
| 187 | if (le16_to_cpu(dent->nlen) != nm->len) | ||
| 188 | return -EINVAL; | ||
| 189 | if (memcmp(dent->name, nm->name, nm->len)) | ||
| 190 | return -EINVAL; | ||
| 191 | return 0; | ||
| 192 | } | ||
| 193 | |||
| 194 | #else | ||
| 195 | |||
| 196 | #define dbg_check_name(dent, nm) 0 | ||
| 197 | |||
| 198 | #endif | ||
| 199 | |||
/*
 * ubifs_lookup - look up a directory entry ('->lookup()' VFS callback).
 * @dir: parent directory inode
 * @dentry: dentry of the name being looked up
 * @nd: VFS name-resolution data (not used here)
 *
 * Looks @dentry->d_name up in the TNC and attaches the resolved inode (or a
 * negative entry when the name does not exist) to @dentry via 'd_add()'.
 * Returns %NULL on success and an error pointer on failure.
 */
static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
				   struct nameidata *nd)
{
	int err;
	union ubifs_key key;
	struct inode *inode = NULL;
	struct ubifs_dent_node *dent;
	struct ubifs_info *c = dir->i_sb->s_fs_info;

	dbg_gen("'%.*s' in dir ino %lu",
		dentry->d_name.len, dentry->d_name.name, dir->i_ino);

	if (dentry->d_name.len > UBIFS_MAX_NLEN)
		return ERR_PTR(-ENAMETOOLONG);

	/* Buffer for the on-flash directory entry node */
	dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
	if (!dent)
		return ERR_PTR(-ENOMEM);

	dent_key_init(c, &key, dir->i_ino, &dentry->d_name);

	/* Name-aware lookup - resolves possible key (hash) collisions */
	err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
	if (err) {
		/*
		 * Do not hash the direntry if parent 'i_nlink' is zero, because
		 * this has side-effects - '->delete_inode()' call will not be
		 * called for the parent orphan inode, because 'd_count' of its
		 * direntry will stay 1 (it'll be negative direntry I guess)
		 * and prevent 'iput_final()' until the dentry is destroyed due
		 * to unmount or memory pressure.
		 */
		if (err == -ENOENT && dir->i_nlink != 0) {
			dbg_gen("not found");
			/* @inode is still %NULL - this adds a negative entry */
			goto done;
		}
		goto out;
	}

	if (dbg_check_name(dent, &dentry->d_name)) {
		err = -EINVAL;
		goto out;
	}

	inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
	if (IS_ERR(inode)) {
		/*
		 * This should not happen. Probably the file-system needs
		 * checking.
		 */
		err = PTR_ERR(inode);
		ubifs_err("dead directory entry '%.*s', error %d",
			  dentry->d_name.len, dentry->d_name.name, err);
		ubifs_ro_mode(c, err);
		goto out;
	}

done:
	kfree(dent);
	/*
	 * Note, d_splice_alias() would be required instead if we supported
	 * NFS.
	 */
	d_add(dentry, inode);
	return NULL;

out:
	kfree(dent);
	return ERR_PTR(err);
}
| 269 | |||
/*
 * ubifs_create - create a regular file ('->create()' VFS callback).
 * @dir: parent directory inode
 * @dentry: dentry of the file to create
 * @mode: mode (permission bits) of the new file
 * @nd: VFS name-resolution data (not used here)
 *
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
			struct nameidata *nd)
{
	struct inode *inode;
	struct ubifs_info *c = dir->i_sb->s_fs_info;
	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
					.dirtied_ino = 1 };
	struct ubifs_inode *dir_ui = ubifs_inode(dir);

	/*
	 * Budget request settings: new inode, new direntry, changing the
	 * parent directory inode.
	 */

	dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	inode = ubifs_new_inode(c, dir, mode);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_budg;
	}

	/* Update the parent directory under its @ui_mutex */
	mutex_lock(&dir_ui->ui_mutex);
	dir->i_size += sz_change;
	dir_ui->ui_size = dir->i_size;
	dir->i_mtime = dir->i_ctime = inode->i_ctime;
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
	if (err)
		goto out_cancel;
	mutex_unlock(&dir_ui->ui_mutex);

	ubifs_release_budget(c, &req);
	insert_inode_hash(inode);
	d_instantiate(dentry, inode);
	return 0;

out_cancel:
	/* Roll back the in-memory changes made to the parent directory */
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	mutex_unlock(&dir_ui->ui_mutex);
	make_bad_inode(inode);
	iput(inode);
out_budg:
	ubifs_release_budget(c, &req);
	ubifs_err("cannot create regular file, error %d", err);
	return err;
}
| 323 | |||
| 324 | /** | ||
| 325 | * vfs_dent_type - get VFS directory entry type. | ||
| 326 | * @type: UBIFS directory entry type | ||
| 327 | * | ||
| 328 | * This function converts UBIFS directory entry type into VFS directory entry | ||
| 329 | * type. | ||
| 330 | */ | ||
| 331 | static unsigned int vfs_dent_type(uint8_t type) | ||
| 332 | { | ||
| 333 | switch (type) { | ||
| 334 | case UBIFS_ITYPE_REG: | ||
| 335 | return DT_REG; | ||
| 336 | case UBIFS_ITYPE_DIR: | ||
| 337 | return DT_DIR; | ||
| 338 | case UBIFS_ITYPE_LNK: | ||
| 339 | return DT_LNK; | ||
| 340 | case UBIFS_ITYPE_BLK: | ||
| 341 | return DT_BLK; | ||
| 342 | case UBIFS_ITYPE_CHR: | ||
| 343 | return DT_CHR; | ||
| 344 | case UBIFS_ITYPE_FIFO: | ||
| 345 | return DT_FIFO; | ||
| 346 | case UBIFS_ITYPE_SOCK: | ||
| 347 | return DT_SOCK; | ||
| 348 | default: | ||
| 349 | BUG(); | ||
| 350 | } | ||
| 351 | return 0; | ||
| 352 | } | ||
| 353 | |||
| 354 | /* | ||
| 355 | * The classical Unix view for directory is that it is a linear array of | ||
| 356 | * (name, inode number) entries. Linux/VFS assumes this model as well. | ||
| 357 | * Particularly, 'readdir()' call wants us to return a directory entry offset | ||
| 358 | * which later may be used to continue 'readdir()'ing the directory or to | ||
| 359 | * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this | ||
| 360 | * model because directory entries are identified by keys, which may collide. | ||
| 361 | * | ||
| 362 | * UBIFS uses directory entry hash value for directory offsets, so | ||
| 363 | * 'seekdir()'/'telldir()' may not always work because of possible key | ||
| 364 | * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work | ||
| 365 | * properly by means of saving full directory entry name in the private field | ||
| 366 | * of the file description object. | ||
| 367 | * | ||
| 368 | * This means that UBIFS cannot support NFS which requires full | ||
| 369 | * 'seekdir()'/'telldir()' support. | ||
| 370 | */ | ||
/*
 * ubifs_readdir - iterate over a directory ('->readdir()' VFS callback).
 *
 * Feeds directory entries to @filldir. The directory entry hash is used as
 * @file->f_pos, and the last returned entry is cached in
 * @file->private_data so that consecutive calls can continue reliably even
 * though hashes may collide (see the comment above this function). Returns
 * zero on success or when the listing is complete, and a negative error
 * code on failure.
 */
static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
{
	int err, over = 0;
	struct qstr nm;
	union ubifs_key key;
	struct ubifs_dent_node *dent;
	struct inode *dir = file->f_path.dentry->d_inode;
	struct ubifs_info *c = dir->i_sb->s_fs_info;

	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);

	if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
		/*
		 * The directory was seek'ed to a senseless position or there
		 * are no more entries.
		 */
		return 0;

	/* File positions 0 and 1 correspond to "." and ".." */
	if (file->f_pos == 0) {
		ubifs_assert(!file->private_data);
		over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
		if (over)
			return 0;
		file->f_pos = 1;
	}

	if (file->f_pos == 1) {
		ubifs_assert(!file->private_data);
		over = filldir(dirent, "..", 2, 1,
			       parent_ino(file->f_path.dentry), DT_DIR);
		if (over)
			return 0;

		/* Find the first entry in TNC and save it */
		lowest_dent_key(c, &key, dir->i_ino);
		nm.name = NULL;
		dent = ubifs_tnc_next_ent(c, &key, &nm);
		if (IS_ERR(dent)) {
			err = PTR_ERR(dent);
			goto out;
		}

		file->f_pos = key_hash_flash(c, &dent->key);
		file->private_data = dent;
	}

	dent = file->private_data;
	if (!dent) {
		/*
		 * The directory was seek'ed to and is now readdir'ed.
		 * Find the entry corresponding to @file->f_pos or the
		 * closest one.
		 */
		dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
		nm.name = NULL;
		dent = ubifs_tnc_next_ent(c, &key, &nm);
		if (IS_ERR(dent)) {
			err = PTR_ERR(dent);
			goto out;
		}
		file->f_pos = key_hash_flash(c, &dent->key);
		file->private_data = dent;
	}

	while (1) {
		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
			dent->name, le64_to_cpu(dent->inum),
			key_hash_flash(c, &dent->key));
		ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);

		nm.len = le16_to_cpu(dent->nlen);
		over = filldir(dirent, dent->name, nm.len, file->f_pos,
			       le64_to_cpu(dent->inum),
			       vfs_dent_type(dent->type));
		if (over)
			return 0;

		/* Switch to the next entry */
		key_read(c, &dent->key, &key);
		nm.name = dent->name;
		dent = ubifs_tnc_next_ent(c, &key, &nm);
		if (IS_ERR(dent)) {
			err = PTR_ERR(dent);
			goto out;
		}

		/* Replace the cached entry with the new one */
		kfree(file->private_data);
		file->f_pos = key_hash_flash(c, &dent->key);
		file->private_data = dent;
		cond_resched();
	}

out:
	if (err != -ENOENT) {
		ubifs_err("cannot find next direntry, error %d", err);
		return err;
	}

	/* -ENOENT means the listing is complete - drop the saved state */
	kfree(file->private_data);
	file->private_data = NULL;
	/* Position 2 marks "no more entries" (see the check at the top) */
	file->f_pos = 2;
	return 0;
}
| 475 | |||
| 476 | /* If a directory is seeked, we have to free saved readdir() state */ | ||
| 477 | static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) | ||
| 478 | { | ||
| 479 | kfree(file->private_data); | ||
| 480 | file->private_data = NULL; | ||
| 481 | return generic_file_llseek(file, offset, origin); | ||
| 482 | } | ||
| 483 | |||
| 484 | /* Free saved readdir() state when the directory is closed */ | ||
| 485 | static int ubifs_dir_release(struct inode *dir, struct file *file) | ||
| 486 | { | ||
| 487 | kfree(file->private_data); | ||
| 488 | file->private_data = NULL; | ||
| 489 | return 0; | ||
| 490 | } | ||
| 491 | |||
| 492 | /** | ||
| 493 | * lock_2_inodes - lock two UBIFS inodes. | ||
| 494 | * @inode1: first inode | ||
| 495 | * @inode2: second inode | ||
| 496 | */ | ||
| 497 | static void lock_2_inodes(struct inode *inode1, struct inode *inode2) | ||
| 498 | { | ||
| 499 | if (inode1->i_ino < inode2->i_ino) { | ||
| 500 | mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); | ||
| 501 | mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3); | ||
| 502 | } else { | ||
| 503 | mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); | ||
| 504 | mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3); | ||
| 505 | } | ||
| 506 | } | ||
| 507 | |||
/**
 * unlock_2_inodes - unlock two UBIFS inodes.
 * @inode1: first inode
 * @inode2: second inode
 */
static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
{
	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
	mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
}
| 518 | |||
/*
 * ubifs_link - create a hard-link ('->link()' VFS callback).
 * @old_dentry: dentry of the existing link target
 * @dir: parent directory of the new link
 * @dentry: dentry of the new link
 *
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct ubifs_info *c = dir->i_sb->s_fs_info;
	struct inode *inode = old_dentry->d_inode;
	struct ubifs_inode *ui = ubifs_inode(inode);
	struct ubifs_inode *dir_ui = ubifs_inode(dir);
	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
	struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
					.dirtied_ino_d = ui->data_len };

	/*
	 * Budget request settings: new direntry, changing the target inode,
	 * changing the parent inode.
	 */

	dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
		inode->i_nlink, dir->i_ino);
	err = dbg_check_synced_i_size(inode);
	if (err)
		return err;

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	lock_2_inodes(dir, inode);
	inc_nlink(inode);
	/* Take a reference for the new dentry; dropped via iput() on error */
	atomic_inc(&inode->i_count);
	inode->i_ctime = ubifs_current_time(inode);
	dir->i_size += sz_change;
	dir_ui->ui_size = dir->i_size;
	dir->i_mtime = dir->i_ctime = inode->i_ctime;
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
	if (err)
		goto out_cancel;
	unlock_2_inodes(dir, inode);

	ubifs_release_budget(c, &req);
	d_instantiate(dentry, inode);
	return 0;

out_cancel:
	/* Roll back the in-memory changes made above */
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	drop_nlink(inode);
	unlock_2_inodes(dir, inode);
	ubifs_release_budget(c, &req);
	iput(inode);
	return err;
}
| 571 | |||
/*
 * ubifs_unlink - remove a directory entry ('->unlink()' VFS callback).
 * @dir: parent directory inode
 * @dentry: dentry of the file to unlink
 *
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ubifs_info *c = dir->i_sb->s_fs_info;
	struct inode *inode = dentry->d_inode;
	struct ubifs_inode *dir_ui = ubifs_inode(dir);
	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
	int err, budgeted = 1;
	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };

	/*
	 * Budget request settings: deletion direntry, deletion inode (+1 for
	 * @dirtied_ino), changing the parent directory inode. If budgeting
	 * fails, go ahead anyway because we have extra space reserved for
	 * deletions.
	 */

	dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
		inode->i_nlink, dir->i_ino);
	err = dbg_check_synced_i_size(inode);
	if (err)
		return err;

	err = ubifs_budget_space(c, &req);
	if (err) {
		/* Only -ENOSPC may be ignored - see the comment above */
		if (err != -ENOSPC)
			return err;
		err = 0;
		budgeted = 0;
	}

	lock_2_inodes(dir, inode);
	inode->i_ctime = ubifs_current_time(dir);
	drop_nlink(inode);
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	dir->i_mtime = dir->i_ctime = inode->i_ctime;
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
	if (err)
		goto out_cancel;
	unlock_2_inodes(dir, inode);

	if (budgeted)
		ubifs_release_budget(c, &req);
	else {
		/* We've deleted something - clean the "no space" flags */
		c->nospace = c->nospace_rp = 0;
		smp_wmb();
	}
	return 0;

out_cancel:
	/* Roll back the in-memory changes made above */
	dir->i_size += sz_change;
	dir_ui->ui_size = dir->i_size;
	inc_nlink(inode);
	unlock_2_inodes(dir, inode);
	if (budgeted)
		ubifs_release_budget(c, &req);
	return err;
}
| 632 | |||
| 633 | /** | ||
| 634 | * check_dir_empty - check if a directory is empty or not. | ||
| 635 | * @c: UBIFS file-system description object | ||
| 636 | * @dir: VFS inode object of the directory to check | ||
| 637 | * | ||
| 638 | * This function checks if directory @dir is empty. Returns zero if the | ||
| 639 | * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes | ||
| 640 | * in case of of errors. | ||
| 641 | */ | ||
| 642 | static int check_dir_empty(struct ubifs_info *c, struct inode *dir) | ||
| 643 | { | ||
| 644 | struct qstr nm = { .name = NULL }; | ||
| 645 | struct ubifs_dent_node *dent; | ||
| 646 | union ubifs_key key; | ||
| 647 | int err; | ||
| 648 | |||
| 649 | lowest_dent_key(c, &key, dir->i_ino); | ||
| 650 | dent = ubifs_tnc_next_ent(c, &key, &nm); | ||
| 651 | if (IS_ERR(dent)) { | ||
| 652 | err = PTR_ERR(dent); | ||
| 653 | if (err == -ENOENT) | ||
| 654 | err = 0; | ||
| 655 | } else { | ||
| 656 | kfree(dent); | ||
| 657 | err = -ENOTEMPTY; | ||
| 658 | } | ||
| 659 | return err; | ||
| 660 | } | ||
| 661 | |||
/*
 * ubifs_rmdir - remove a directory ('->rmdir()' VFS callback).
 * @dir: parent directory inode
 * @dentry: dentry of the directory to remove
 *
 * Returns zero on success, %-ENOTEMPTY if the directory is not empty, and
 * other negative error codes on failure.
 */
static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct ubifs_info *c = dir->i_sb->s_fs_info;
	struct inode *inode = dentry->d_inode;
	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
	int err, budgeted = 1;
	struct ubifs_inode *dir_ui = ubifs_inode(dir);
	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };

	/*
	 * Budget request settings: deletion direntry, deletion inode and
	 * changing the parent inode. If budgeting fails, go ahead anyway
	 * because we have extra space reserved for deletions.
	 */

	dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
		dentry->d_name.name, inode->i_ino, dir->i_ino);

	err = check_dir_empty(c, dentry->d_inode);
	if (err)
		return err;

	err = ubifs_budget_space(c, &req);
	if (err) {
		/*
		 * Only -ENOSPC may be ignored (see above); @err is
		 * overwritten by the 'ubifs_jnl_update()' call below.
		 */
		if (err != -ENOSPC)
			return err;
		budgeted = 0;
	}

	lock_2_inodes(dir, inode);
	inode->i_ctime = ubifs_current_time(dir);
	/* An empty directory has nlink 2 ("." and the parent's entry) */
	clear_nlink(inode);
	drop_nlink(dir);
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	dir->i_mtime = dir->i_ctime = inode->i_ctime;
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
	if (err)
		goto out_cancel;
	unlock_2_inodes(dir, inode);

	if (budgeted)
		ubifs_release_budget(c, &req);
	else {
		/* We've deleted something - clean the "no space" flags */
		c->nospace = c->nospace_rp = 0;
		smp_wmb();
	}
	return 0;

out_cancel:
	/* Roll back the in-memory changes made above */
	dir->i_size += sz_change;
	dir_ui->ui_size = dir->i_size;
	inc_nlink(dir);
	/* Restore nlink 2 which was dropped by 'clear_nlink()' */
	inc_nlink(inode);
	inc_nlink(inode);
	unlock_2_inodes(dir, inode);
	if (budgeted)
		ubifs_release_budget(c, &req);
	return err;
}
| 723 | |||
/*
 * ubifs_mkdir - create a directory ('->mkdir()' VFS callback).
 * @dir: parent directory inode
 * @dentry: dentry of the directory to create
 * @mode: mode (permission bits) of the new directory
 *
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct inode *inode;
	struct ubifs_inode *dir_ui = ubifs_inode(dir);
	struct ubifs_info *c = dir->i_sb->s_fs_info;
	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
					.dirtied_ino_d = 1 };

	/*
	 * Budget request settings: new inode, new direntry and changing parent
	 * directory inode.
	 */

	dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_budg;
	}

	mutex_lock(&dir_ui->ui_mutex);
	insert_inode_hash(inode);
	/* Directory link counts: "." for @inode, the new entry for @dir */
	inc_nlink(inode);
	inc_nlink(dir);
	dir->i_size += sz_change;
	dir_ui->ui_size = dir->i_size;
	dir->i_mtime = dir->i_ctime = inode->i_ctime;
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
	if (err) {
		ubifs_err("cannot create directory, error %d", err);
		goto out_cancel;
	}
	mutex_unlock(&dir_ui->ui_mutex);

	ubifs_release_budget(c, &req);
	d_instantiate(dentry, inode);
	return 0;

out_cancel:
	/* Roll back the in-memory changes made to the parent directory */
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	drop_nlink(dir);
	mutex_unlock(&dir_ui->ui_mutex);
	make_bad_inode(inode);
	iput(inode);
out_budg:
	ubifs_release_budget(c, &req);
	return err;
}
| 780 | |||
| 781 | static int ubifs_mknod(struct inode *dir, struct dentry *dentry, | ||
| 782 | int mode, dev_t rdev) | ||
| 783 | { | ||
| 784 | struct inode *inode; | ||
| 785 | struct ubifs_inode *ui; | ||
| 786 | struct ubifs_inode *dir_ui = ubifs_inode(dir); | ||
| 787 | struct ubifs_info *c = dir->i_sb->s_fs_info; | ||
| 788 | union ubifs_dev_desc *dev = NULL; | ||
| 789 | int sz_change = CALC_DENT_SIZE(dentry->d_name.len); | ||
| 790 | int err, devlen = 0; | ||
| 791 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, | ||
| 792 | .new_ino_d = devlen, .dirtied_ino = 1 }; | ||
| 793 | |||
| 794 | /* | ||
| 795 | * Budget request settings: new inode, new direntry and changing parent | ||
| 796 | * directory inode. | ||
| 797 | */ | ||
| 798 | |||
| 799 | dbg_gen("dent '%.*s' in dir ino %lu", | ||
| 800 | dentry->d_name.len, dentry->d_name.name, dir->i_ino); | ||
| 801 | |||
| 802 | if (!new_valid_dev(rdev)) | ||
| 803 | return -EINVAL; | ||
| 804 | |||
| 805 | if (S_ISBLK(mode) || S_ISCHR(mode)) { | ||
| 806 | dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); | ||
| 807 | if (!dev) | ||
| 808 | return -ENOMEM; | ||
| 809 | devlen = ubifs_encode_dev(dev, rdev); | ||
| 810 | } | ||
| 811 | |||
| 812 | err = ubifs_budget_space(c, &req); | ||
| 813 | if (err) { | ||
| 814 | kfree(dev); | ||
| 815 | return err; | ||
| 816 | } | ||
| 817 | |||
| 818 | inode = ubifs_new_inode(c, dir, mode); | ||
| 819 | if (IS_ERR(inode)) { | ||
| 820 | kfree(dev); | ||
| 821 | err = PTR_ERR(inode); | ||
| 822 | goto out_budg; | ||
| 823 | } | ||
| 824 | |||
| 825 | init_special_inode(inode, inode->i_mode, rdev); | ||
| 826 | inode->i_size = ubifs_inode(inode)->ui_size = devlen; | ||
| 827 | ui = ubifs_inode(inode); | ||
| 828 | ui->data = dev; | ||
| 829 | ui->data_len = devlen; | ||
| 830 | |||
| 831 | mutex_lock(&dir_ui->ui_mutex); | ||
| 832 | dir->i_size += sz_change; | ||
| 833 | dir_ui->ui_size = dir->i_size; | ||
| 834 | dir->i_mtime = dir->i_ctime = inode->i_ctime; | ||
| 835 | err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); | ||
| 836 | if (err) | ||
| 837 | goto out_cancel; | ||
| 838 | mutex_unlock(&dir_ui->ui_mutex); | ||
| 839 | |||
| 840 | ubifs_release_budget(c, &req); | ||
| 841 | insert_inode_hash(inode); | ||
| 842 | d_instantiate(dentry, inode); | ||
| 843 | return 0; | ||
| 844 | |||
| 845 | out_cancel: | ||
| 846 | dir->i_size -= sz_change; | ||
| 847 | dir_ui->ui_size = dir->i_size; | ||
| 848 | mutex_unlock(&dir_ui->ui_mutex); | ||
| 849 | make_bad_inode(inode); | ||
| 850 | iput(inode); | ||
| 851 | out_budg: | ||
| 852 | ubifs_release_budget(c, &req); | ||
| 853 | return err; | ||
| 854 | } | ||
| 855 | |||
/*
 * ubifs_symlink - create a symbolic link ('->symlink()' VFS callback).
 * @dir: parent directory inode
 * @dentry: dentry of the symlink to create
 * @symname: the link target path
 *
 * Returns zero on success, %-ENAMETOOLONG if @symname does not fit into the
 * inode data area, and other negative error codes on failure.
 */
static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct inode *inode;
	struct ubifs_inode *ui;
	struct ubifs_inode *dir_ui = ubifs_inode(dir);
	struct ubifs_info *c = dir->i_sb->s_fs_info;
	int err, len = strlen(symname);
	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
					.new_ino_d = len, .dirtied_ino = 1 };

	/*
	 * Budget request settings: new inode, new direntry and changing parent
	 * directory inode.
	 */

	dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
		dentry->d_name.name, symname, dir->i_ino);

	if (len > UBIFS_MAX_INO_DATA)
		return -ENAMETOOLONG;

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_budg;
	}

	ui = ubifs_inode(inode);
	ui->data = kmalloc(len + 1, GFP_NOFS);
	if (!ui->data) {
		err = -ENOMEM;
		goto out_inode;
	}

	memcpy(ui->data, symname, len);
	((char *)ui->data)[len] = '\0';
	/*
	 * The terminating zero byte is not written to the flash media and it
	 * is put just to make later in-memory string processing simpler. Thus,
	 * data length is @len, not @len + %1.
	 */
	ui->data_len = len;
	inode->i_size = ubifs_inode(inode)->ui_size = len;

	mutex_lock(&dir_ui->ui_mutex);
	dir->i_size += sz_change;
	dir_ui->ui_size = dir->i_size;
	dir->i_mtime = dir->i_ctime = inode->i_ctime;
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
	if (err)
		goto out_cancel;
	mutex_unlock(&dir_ui->ui_mutex);

	ubifs_release_budget(c, &req);
	insert_inode_hash(inode);
	d_instantiate(dentry, inode);
	return 0;

out_cancel:
	/* Roll back the in-memory changes made to the parent directory */
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	mutex_unlock(&dir_ui->ui_mutex);
out_inode:
	make_bad_inode(inode);
	iput(inode);
out_budg:
	ubifs_release_budget(c, &req);
	return err;
}
| 931 | |||
/**
 * lock_3_inodes - lock three UBIFS inodes for rename.
 * @inode1: first inode
 * @inode2: second inode
 * @inode3: third inode
 *
 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
 * be null.
 *
 * NOTE(review): the 3-inode case below orders by inode *pointer* value,
 * whereas 'lock_2_inodes()' orders by @i_ino - presumably consistent thanks
 * to the fixed WB_MUTEX_* lockdep class assignment, but worth confirming.
 */
static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
			  struct inode *inode3)
{
	struct inode *i1, *i2, *i3;

	if (!inode3) {
		/* Only two distinct inodes (or even one) to lock */
		if (inode1 != inode2) {
			lock_2_inodes(inode1, inode2);
			return;
		}
		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
		return;
	}

	if (inode1 == inode2) {
		lock_2_inodes(inode1, inode3);
		return;
	}

	/* 3 different inodes */
	if (inode1 < inode2) {
		i3 = inode2;
		if (inode1 < inode3) {
			i1 = inode1;
			i2 = inode3;
		} else {
			i1 = inode3;
			i2 = inode1;
		}
	} else {
		i3 = inode1;
		if (inode2 < inode3) {
			i1 = inode2;
			i2 = inode3;
		} else {
			i1 = inode3;
			i2 = inode2;
		}
	}
	/* Lock the lowest-address inode first, then the remaining two */
	mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
	lock_2_inodes(i2, i3);
}
| 983 | |||
/**
 * unlock_3_inodes - unlock three UBIFS inodes for rename.
 * @inode1: first inode
 * @inode2: second inode (may equal @inode1, in which case it is skipped)
 * @inode3: third inode (may be %NULL, in which case it is skipped)
 */
static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
			    struct inode *inode3)
{
	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
	if (inode1 != inode2)
		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
	if (inode3)
		mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
}
| 999 | |||
| 1000 | static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
| 1001 | struct inode *new_dir, struct dentry *new_dentry) | ||
| 1002 | { | ||
| 1003 | struct ubifs_info *c = old_dir->i_sb->s_fs_info; | ||
| 1004 | struct inode *old_inode = old_dentry->d_inode; | ||
| 1005 | struct inode *new_inode = new_dentry->d_inode; | ||
| 1006 | struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); | ||
| 1007 | int err, release, sync = 0, move = (new_dir != old_dir); | ||
| 1008 | int is_dir = S_ISDIR(old_inode->i_mode); | ||
| 1009 | int unlink = !!new_inode; | ||
| 1010 | int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); | ||
| 1011 | int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); | ||
| 1012 | struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, | ||
| 1013 | .dirtied_ino = 3 }; | ||
| 1014 | struct ubifs_budget_req ino_req = { .dirtied_ino = 1, | ||
| 1015 | .dirtied_ino_d = old_inode_ui->data_len }; | ||
| 1016 | struct timespec time; | ||
| 1017 | |||
| 1018 | /* | ||
| 1019 | * Budget request settings: deletion direntry, new direntry, removing | ||
| 1020 | * the old inode, and changing old and new parent directory inodes. | ||
| 1021 | * | ||
| 1022 | * However, this operation also marks the target inode as dirty and | ||
| 1023 | * does not write it, so we allocate budget for the target inode | ||
| 1024 | * separately. | ||
| 1025 | */ | ||
| 1026 | |||
| 1027 | dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " | ||
| 1028 | "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, | ||
| 1029 | old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, | ||
| 1030 | new_dentry->d_name.name, new_dir->i_ino); | ||
| 1031 | |||
| 1032 | if (unlink && is_dir) { | ||
| 1033 | err = check_dir_empty(c, new_inode); | ||
| 1034 | if (err) | ||
| 1035 | return err; | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | err = ubifs_budget_space(c, &req); | ||
| 1039 | if (err) | ||
| 1040 | return err; | ||
| 1041 | err = ubifs_budget_space(c, &ino_req); | ||
| 1042 | if (err) { | ||
| 1043 | ubifs_release_budget(c, &req); | ||
| 1044 | return err; | ||
| 1045 | } | ||
| 1046 | |||
| 1047 | lock_3_inodes(old_dir, new_dir, new_inode); | ||
| 1048 | |||
| 1049 | /* | ||
| 1050 | * Like most other Unix systems, set the @i_ctime for inodes on a | ||
| 1051 | * rename. | ||
| 1052 | */ | ||
| 1053 | time = ubifs_current_time(old_dir); | ||
| 1054 | old_inode->i_ctime = time; | ||
| 1055 | |||
| 1056 | /* We must adjust parent link count when renaming directories */ | ||
| 1057 | if (is_dir) { | ||
| 1058 | if (move) { | ||
| 1059 | /* | ||
| 1060 | * @old_dir loses a link because we are moving | ||
| 1061 | * @old_inode to a different directory. | ||
| 1062 | */ | ||
| 1063 | drop_nlink(old_dir); | ||
| 1064 | /* | ||
| 1065 | * @new_dir only gains a link if we are not also | ||
| 1066 | * overwriting an existing directory. | ||
| 1067 | */ | ||
| 1068 | if (!unlink) | ||
| 1069 | inc_nlink(new_dir); | ||
| 1070 | } else { | ||
| 1071 | /* | ||
| 1072 | * @old_inode is not moving to a different directory, | ||
| 1073 | * but @old_dir still loses a link if we are | ||
| 1074 | * overwriting an existing directory. | ||
| 1075 | */ | ||
| 1076 | if (unlink) | ||
| 1077 | drop_nlink(old_dir); | ||
| 1078 | } | ||
| 1079 | } | ||
| 1080 | |||
| 1081 | old_dir->i_size -= old_sz; | ||
| 1082 | ubifs_inode(old_dir)->ui_size = old_dir->i_size; | ||
| 1083 | old_dir->i_mtime = old_dir->i_ctime = time; | ||
| 1084 | new_dir->i_mtime = new_dir->i_ctime = time; | ||
| 1085 | |||
| 1086 | /* | ||
| 1087 | * And finally, if we unlinked a direntry which happened to have the | ||
| 1088 | * same name as the moved direntry, we have to decrement @i_nlink of | ||
| 1089 | * the unlinked inode and change its ctime. | ||
| 1090 | */ | ||
| 1091 | if (unlink) { | ||
| 1092 | /* | ||
| 1093 | * Directories cannot have hard-links, so if this is a | ||
| 1094 | * directory, decrement its @i_nlink twice because an empty | ||
| 1095 | * directory has @i_nlink 2. | ||
| 1096 | */ | ||
| 1097 | if (is_dir) | ||
| 1098 | drop_nlink(new_inode); | ||
| 1099 | new_inode->i_ctime = time; | ||
| 1100 | drop_nlink(new_inode); | ||
| 1101 | } else { | ||
| 1102 | new_dir->i_size += new_sz; | ||
| 1103 | ubifs_inode(new_dir)->ui_size = new_dir->i_size; | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | /* | ||
| 1107 | * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode | ||
| 1108 | * is dirty, because this will be done later on at the end of | ||
| 1109 | * 'ubifs_rename()'. | ||
| 1110 | */ | ||
| 1111 | if (IS_SYNC(old_inode)) { | ||
| 1112 | sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); | ||
| 1113 | if (unlink && IS_SYNC(new_inode)) | ||
| 1114 | sync = 1; | ||
| 1115 | } | ||
| 1116 | err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, | ||
| 1117 | sync); | ||
| 1118 | if (err) | ||
| 1119 | goto out_cancel; | ||
| 1120 | |||
| 1121 | unlock_3_inodes(old_dir, new_dir, new_inode); | ||
| 1122 | ubifs_release_budget(c, &req); | ||
| 1123 | |||
| 1124 | mutex_lock(&old_inode_ui->ui_mutex); | ||
| 1125 | release = old_inode_ui->dirty; | ||
| 1126 | mark_inode_dirty_sync(old_inode); | ||
| 1127 | mutex_unlock(&old_inode_ui->ui_mutex); | ||
| 1128 | |||
| 1129 | if (release) | ||
| 1130 | ubifs_release_budget(c, &ino_req); | ||
| 1131 | if (IS_SYNC(old_inode)) | ||
| 1132 | err = old_inode->i_sb->s_op->write_inode(old_inode, 1); | ||
| 1133 | return err; | ||
| 1134 | |||
| 1135 | out_cancel: | ||
| 1136 | if (unlink) { | ||
| 1137 | if (is_dir) | ||
| 1138 | inc_nlink(new_inode); | ||
| 1139 | inc_nlink(new_inode); | ||
| 1140 | } else { | ||
| 1141 | new_dir->i_size -= new_sz; | ||
| 1142 | ubifs_inode(new_dir)->ui_size = new_dir->i_size; | ||
| 1143 | } | ||
| 1144 | old_dir->i_size += old_sz; | ||
| 1145 | ubifs_inode(old_dir)->ui_size = old_dir->i_size; | ||
| 1146 | if (is_dir) { | ||
| 1147 | if (move) { | ||
| 1148 | inc_nlink(old_dir); | ||
| 1149 | if (!unlink) | ||
| 1150 | drop_nlink(new_dir); | ||
| 1151 | } else { | ||
| 1152 | if (unlink) | ||
| 1153 | inc_nlink(old_dir); | ||
| 1154 | } | ||
| 1155 | } | ||
| 1156 | unlock_3_inodes(old_dir, new_dir, new_inode); | ||
| 1157 | ubifs_release_budget(c, &ino_req); | ||
| 1158 | ubifs_release_budget(c, &req); | ||
| 1159 | return err; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 1163 | struct kstat *stat) | ||
| 1164 | { | ||
| 1165 | loff_t size; | ||
| 1166 | struct inode *inode = dentry->d_inode; | ||
| 1167 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 1168 | |||
| 1169 | mutex_lock(&ui->ui_mutex); | ||
| 1170 | stat->dev = inode->i_sb->s_dev; | ||
| 1171 | stat->ino = inode->i_ino; | ||
| 1172 | stat->mode = inode->i_mode; | ||
| 1173 | stat->nlink = inode->i_nlink; | ||
| 1174 | stat->uid = inode->i_uid; | ||
| 1175 | stat->gid = inode->i_gid; | ||
| 1176 | stat->rdev = inode->i_rdev; | ||
| 1177 | stat->atime = inode->i_atime; | ||
| 1178 | stat->mtime = inode->i_mtime; | ||
| 1179 | stat->ctime = inode->i_ctime; | ||
| 1180 | stat->blksize = UBIFS_BLOCK_SIZE; | ||
| 1181 | stat->size = ui->ui_size; | ||
| 1182 | |||
| 1183 | /* | ||
| 1184 | * Unfortunately, the 'stat()' system call was designed for block | ||
| 1185 | * device based file systems, and it is not appropriate for UBIFS, | ||
| 1186 | * because UBIFS does not have notion of "block". For example, it is | ||
| 1187 | * difficult to tell how many block a directory takes - it actually | ||
| 1188 | * takes less than 300 bytes, but we have to round it to block size, | ||
| 1189 | * which introduces large mistake. This makes utilities like 'du' to | ||
| 1190 | * report completely senseless numbers. This is the reason why UBIFS | ||
| 1191 | * goes the same way as JFFS2 - it reports zero blocks for everything | ||
| 1192 | * but regular files, which makes more sense than reporting completely | ||
| 1193 | * wrong sizes. | ||
| 1194 | */ | ||
| 1195 | if (S_ISREG(inode->i_mode)) { | ||
| 1196 | size = ui->xattr_size; | ||
| 1197 | size += stat->size; | ||
| 1198 | size = ALIGN(size, UBIFS_BLOCK_SIZE); | ||
| 1199 | /* | ||
| 1200 | * Note, user-space expects 512-byte blocks count irrespectively | ||
| 1201 | * of what was reported in @stat->size. | ||
| 1202 | */ | ||
| 1203 | stat->blocks = size >> 9; | ||
| 1204 | } else | ||
| 1205 | stat->blocks = 0; | ||
| 1206 | mutex_unlock(&ui->ui_mutex); | ||
| 1207 | return 0; | ||
| 1208 | } | ||
| 1209 | |||
| 1210 | struct inode_operations ubifs_dir_inode_operations = { | ||
| 1211 | .lookup = ubifs_lookup, | ||
| 1212 | .create = ubifs_create, | ||
| 1213 | .link = ubifs_link, | ||
| 1214 | .symlink = ubifs_symlink, | ||
| 1215 | .unlink = ubifs_unlink, | ||
| 1216 | .mkdir = ubifs_mkdir, | ||
| 1217 | .rmdir = ubifs_rmdir, | ||
| 1218 | .mknod = ubifs_mknod, | ||
| 1219 | .rename = ubifs_rename, | ||
| 1220 | .setattr = ubifs_setattr, | ||
| 1221 | .getattr = ubifs_getattr, | ||
| 1222 | #ifdef CONFIG_UBIFS_FS_XATTR | ||
| 1223 | .setxattr = ubifs_setxattr, | ||
| 1224 | .getxattr = ubifs_getxattr, | ||
| 1225 | .listxattr = ubifs_listxattr, | ||
| 1226 | .removexattr = ubifs_removexattr, | ||
| 1227 | #endif | ||
| 1228 | }; | ||
| 1229 | |||
| 1230 | struct file_operations ubifs_dir_operations = { | ||
| 1231 | .llseek = ubifs_dir_llseek, | ||
| 1232 | .release = ubifs_dir_release, | ||
| 1233 | .read = generic_read_dir, | ||
| 1234 | .readdir = ubifs_readdir, | ||
| 1235 | .fsync = ubifs_fsync, | ||
| 1236 | .unlocked_ioctl = ubifs_ioctl, | ||
| 1237 | #ifdef CONFIG_COMPAT | ||
| 1238 | .compat_ioctl = ubifs_compat_ioctl, | ||
| 1239 | #endif | ||
| 1240 | }; | ||
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c new file mode 100644 index 000000000000..005a3b854d96 --- /dev/null +++ b/fs/ubifs/file.c | |||
| @@ -0,0 +1,1275 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements VFS file and inode operations of regular files, device | ||
| 25 | * nodes and symlinks as well as address space operations. | ||
| 26 | * | ||
| 27 | * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the | ||
| 28 | * page is dirty and is used for budgeting purposes - dirty pages should not be | ||
| 29 | * budgeted. The PG_checked flag is set if full budgeting is required for the | ||
| 30 | * page e.g., when it corresponds to a file hole or it is just beyond the file | ||
| 31 | * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to | ||
| 32 | * fail in this function, and the budget is released in 'ubifs_write_end()'. So | ||
| 33 | * the PG_private and PG_checked flags carry the information about how the page | ||
| 34 | * was budgeted, to make it possible to release the budget properly. | ||
| 35 | * | ||
| 36 | * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations | ||
| 37 | * we implement. However, this is not true for '->writepage()', which might be | ||
| 38 | * called with 'i_mutex' unlocked. For example, when pdflush is performing | ||
| 39 | * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the | ||
| 40 | * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is | ||
| 41 | * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim | ||
| 42 | * path'. So, in '->writepage()' we are only guaranteed that the page is | ||
| 43 | * locked. | ||
| 44 | * | ||
| 45 | * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., | ||
| 46 | * readahead path does not have it locked ("sys_read -> generic_file_aio_read | ||
| 47 | * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is | ||
| 48 | * not set as well. However, UBIFS disables readahead. | ||
| 49 | * | ||
| 50 | * This, for example means that there might be 2 concurrent '->writepage()' | ||
| 51 | * calls for the same inode, but different inode dirty pages. | ||
| 52 | */ | ||
| 53 | |||
| 54 | #include "ubifs.h" | ||
| 55 | #include <linux/mount.h> | ||
| 56 | |||
| 57 | static int read_block(struct inode *inode, void *addr, unsigned int block, | ||
| 58 | struct ubifs_data_node *dn) | ||
| 59 | { | ||
| 60 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 61 | int err, len, out_len; | ||
| 62 | union ubifs_key key; | ||
| 63 | unsigned int dlen; | ||
| 64 | |||
| 65 | data_key_init(c, &key, inode->i_ino, block); | ||
| 66 | err = ubifs_tnc_lookup(c, &key, dn); | ||
| 67 | if (err) { | ||
| 68 | if (err == -ENOENT) | ||
| 69 | /* Not found, so it must be a hole */ | ||
| 70 | memset(addr, 0, UBIFS_BLOCK_SIZE); | ||
| 71 | return err; | ||
| 72 | } | ||
| 73 | |||
| 74 | ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum); | ||
| 75 | |||
| 76 | len = le32_to_cpu(dn->size); | ||
| 77 | if (len <= 0 || len > UBIFS_BLOCK_SIZE) | ||
| 78 | goto dump; | ||
| 79 | |||
| 80 | dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; | ||
| 81 | out_len = UBIFS_BLOCK_SIZE; | ||
| 82 | err = ubifs_decompress(&dn->data, dlen, addr, &out_len, | ||
| 83 | le16_to_cpu(dn->compr_type)); | ||
| 84 | if (err || len != out_len) | ||
| 85 | goto dump; | ||
| 86 | |||
| 87 | /* | ||
| 88 | * Data length can be less than a full block, even for blocks that are | ||
| 89 | * not the last in the file (e.g., as a result of making a hole and | ||
| 90 | * appending data). Ensure that the remainder is zeroed out. | ||
| 91 | */ | ||
| 92 | if (len < UBIFS_BLOCK_SIZE) | ||
| 93 | memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); | ||
| 94 | |||
| 95 | return 0; | ||
| 96 | |||
| 97 | dump: | ||
| 98 | ubifs_err("bad data node (block %u, inode %lu)", | ||
| 99 | block, inode->i_ino); | ||
| 100 | dbg_dump_node(c, dn); | ||
| 101 | return -EINVAL; | ||
| 102 | } | ||
| 103 | |||
| 104 | static int do_readpage(struct page *page) | ||
| 105 | { | ||
| 106 | void *addr; | ||
| 107 | int err = 0, i; | ||
| 108 | unsigned int block, beyond; | ||
| 109 | struct ubifs_data_node *dn; | ||
| 110 | struct inode *inode = page->mapping->host; | ||
| 111 | loff_t i_size = i_size_read(inode); | ||
| 112 | |||
| 113 | dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", | ||
| 114 | inode->i_ino, page->index, i_size, page->flags); | ||
| 115 | ubifs_assert(!PageChecked(page)); | ||
| 116 | ubifs_assert(!PagePrivate(page)); | ||
| 117 | |||
| 118 | addr = kmap(page); | ||
| 119 | |||
| 120 | block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; | ||
| 121 | beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; | ||
| 122 | if (block >= beyond) { | ||
| 123 | /* Reading beyond inode */ | ||
| 124 | SetPageChecked(page); | ||
| 125 | memset(addr, 0, PAGE_CACHE_SIZE); | ||
| 126 | goto out; | ||
| 127 | } | ||
| 128 | |||
| 129 | dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); | ||
| 130 | if (!dn) { | ||
| 131 | err = -ENOMEM; | ||
| 132 | goto error; | ||
| 133 | } | ||
| 134 | |||
| 135 | i = 0; | ||
| 136 | while (1) { | ||
| 137 | int ret; | ||
| 138 | |||
| 139 | if (block >= beyond) { | ||
| 140 | /* Reading beyond inode */ | ||
| 141 | err = -ENOENT; | ||
| 142 | memset(addr, 0, UBIFS_BLOCK_SIZE); | ||
| 143 | } else { | ||
| 144 | ret = read_block(inode, addr, block, dn); | ||
| 145 | if (ret) { | ||
| 146 | err = ret; | ||
| 147 | if (err != -ENOENT) | ||
| 148 | break; | ||
| 149 | } | ||
| 150 | } | ||
| 151 | if (++i >= UBIFS_BLOCKS_PER_PAGE) | ||
| 152 | break; | ||
| 153 | block += 1; | ||
| 154 | addr += UBIFS_BLOCK_SIZE; | ||
| 155 | } | ||
| 156 | if (err) { | ||
| 157 | if (err == -ENOENT) { | ||
| 158 | /* Not found, so it must be a hole */ | ||
| 159 | SetPageChecked(page); | ||
| 160 | dbg_gen("hole"); | ||
| 161 | goto out_free; | ||
| 162 | } | ||
| 163 | ubifs_err("cannot read page %lu of inode %lu, error %d", | ||
| 164 | page->index, inode->i_ino, err); | ||
| 165 | goto error; | ||
| 166 | } | ||
| 167 | |||
| 168 | out_free: | ||
| 169 | kfree(dn); | ||
| 170 | out: | ||
| 171 | SetPageUptodate(page); | ||
| 172 | ClearPageError(page); | ||
| 173 | flush_dcache_page(page); | ||
| 174 | kunmap(page); | ||
| 175 | return 0; | ||
| 176 | |||
| 177 | error: | ||
| 178 | kfree(dn); | ||
| 179 | ClearPageUptodate(page); | ||
| 180 | SetPageError(page); | ||
| 181 | flush_dcache_page(page); | ||
| 182 | kunmap(page); | ||
| 183 | return err; | ||
| 184 | } | ||
| 185 | |||
| 186 | /** | ||
| 187 | * release_new_page_budget - release budget of a new page. | ||
| 188 | * @c: UBIFS file-system description object | ||
| 189 | * | ||
| 190 | * This is a helper function which releases budget corresponding to the budget | ||
| 191 | * of one new page of data. | ||
| 192 | */ | ||
| 193 | static void release_new_page_budget(struct ubifs_info *c) | ||
| 194 | { | ||
| 195 | struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; | ||
| 196 | |||
| 197 | ubifs_release_budget(c, &req); | ||
| 198 | } | ||
| 199 | |||
| 200 | /** | ||
| 201 | * release_existing_page_budget - release budget of an existing page. | ||
| 202 | * @c: UBIFS file-system description object | ||
| 203 | * | ||
| 204 | * This is a helper function which releases budget corresponding to the budget | ||
| 205 | * of changing one one page of data which already exists on the flash media. | ||
| 206 | */ | ||
| 207 | static void release_existing_page_budget(struct ubifs_info *c) | ||
| 208 | { | ||
| 209 | struct ubifs_budget_req req = { .dd_growth = c->page_budget}; | ||
| 210 | |||
| 211 | ubifs_release_budget(c, &req); | ||
| 212 | } | ||
| 213 | |||
| 214 | static int write_begin_slow(struct address_space *mapping, | ||
| 215 | loff_t pos, unsigned len, struct page **pagep) | ||
| 216 | { | ||
| 217 | struct inode *inode = mapping->host; | ||
| 218 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 219 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
| 220 | struct ubifs_budget_req req = { .new_page = 1 }; | ||
| 221 | int uninitialized_var(err), appending = !!(pos + len > inode->i_size); | ||
| 222 | struct page *page; | ||
| 223 | |||
| 224 | dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", | ||
| 225 | inode->i_ino, pos, len, inode->i_size); | ||
| 226 | |||
| 227 | /* | ||
| 228 | * At the slow path we have to budget before locking the page, because | ||
| 229 | * budgeting may force write-back, which would wait on locked pages and | ||
| 230 | * deadlock if we had the page locked. At this point we do not know | ||
| 231 | * anything about the page, so assume that this is a new page which is | ||
| 232 | * written to a hole. This corresponds to largest budget. Later the | ||
| 233 | * budget will be amended if this is not true. | ||
| 234 | */ | ||
| 235 | if (appending) | ||
| 236 | /* We are appending data, budget for inode change */ | ||
| 237 | req.dirtied_ino = 1; | ||
| 238 | |||
| 239 | err = ubifs_budget_space(c, &req); | ||
| 240 | if (unlikely(err)) | ||
| 241 | return err; | ||
| 242 | |||
| 243 | page = __grab_cache_page(mapping, index); | ||
| 244 | if (unlikely(!page)) { | ||
| 245 | ubifs_release_budget(c, &req); | ||
| 246 | return -ENOMEM; | ||
| 247 | } | ||
| 248 | |||
| 249 | if (!PageUptodate(page)) { | ||
| 250 | if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) | ||
| 251 | SetPageChecked(page); | ||
| 252 | else { | ||
| 253 | err = do_readpage(page); | ||
| 254 | if (err) { | ||
| 255 | unlock_page(page); | ||
| 256 | page_cache_release(page); | ||
| 257 | return err; | ||
| 258 | } | ||
| 259 | } | ||
| 260 | |||
| 261 | SetPageUptodate(page); | ||
| 262 | ClearPageError(page); | ||
| 263 | } | ||
| 264 | |||
| 265 | if (PagePrivate(page)) | ||
| 266 | /* | ||
| 267 | * The page is dirty, which means it was budgeted twice: | ||
| 268 | * o first time the budget was allocated by the task which | ||
| 269 | * made the page dirty and set the PG_private flag; | ||
| 270 | * o and then we budgeted for it for the second time at the | ||
| 271 | * very beginning of this function. | ||
| 272 | * | ||
| 273 | * So what we have to do is to release the page budget we | ||
| 274 | * allocated. | ||
| 275 | */ | ||
| 276 | release_new_page_budget(c); | ||
| 277 | else if (!PageChecked(page)) | ||
| 278 | /* | ||
| 279 | * We are changing a page which already exists on the media. | ||
| 280 | * This means that changing the page does not make the amount | ||
| 281 | * of indexing information larger, and this part of the budget | ||
| 282 | * which we have already acquired may be released. | ||
| 283 | */ | ||
| 284 | ubifs_convert_page_budget(c); | ||
| 285 | |||
| 286 | if (appending) { | ||
| 287 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 288 | |||
| 289 | /* | ||
| 290 | * 'ubifs_write_end()' is optimized from the fast-path part of | ||
| 291 | * 'ubifs_write_begin()' and expects the @ui_mutex to be locked | ||
| 292 | * if data is appended. | ||
| 293 | */ | ||
| 294 | mutex_lock(&ui->ui_mutex); | ||
| 295 | if (ui->dirty) | ||
| 296 | /* | ||
| 297 | * The inode is dirty already, so we may free the | ||
| 298 | * budget we allocated. | ||
| 299 | */ | ||
| 300 | ubifs_release_dirty_inode_budget(c, ui); | ||
| 301 | } | ||
| 302 | |||
| 303 | *pagep = page; | ||
| 304 | return 0; | ||
| 305 | } | ||
| 306 | |||
| 307 | /** | ||
| 308 | * allocate_budget - allocate budget for 'ubifs_write_begin()'. | ||
| 309 | * @c: UBIFS file-system description object | ||
| 310 | * @page: page to allocate budget for | ||
| 311 | * @ui: UBIFS inode object the page belongs to | ||
| 312 | * @appending: non-zero if the page is appended | ||
| 313 | * | ||
| 314 | * This is a helper function for 'ubifs_write_begin()' which allocates budget | ||
| 315 | * for the operation. The budget is allocated differently depending on whether | ||
| 316 | * this is appending, whether the page is dirty or not, and so on. This | ||
| 317 | * function leaves the @ui->ui_mutex locked in case of appending. Returns zero | ||
| 318 | * in case of success and %-ENOSPC in case of failure. | ||
| 319 | */ | ||
| 320 | static int allocate_budget(struct ubifs_info *c, struct page *page, | ||
| 321 | struct ubifs_inode *ui, int appending) | ||
| 322 | { | ||
| 323 | struct ubifs_budget_req req = { .fast = 1 }; | ||
| 324 | |||
| 325 | if (PagePrivate(page)) { | ||
| 326 | if (!appending) | ||
| 327 | /* | ||
| 328 | * The page is dirty and we are not appending, which | ||
| 329 | * means no budget is needed at all. | ||
| 330 | */ | ||
| 331 | return 0; | ||
| 332 | |||
| 333 | mutex_lock(&ui->ui_mutex); | ||
| 334 | if (ui->dirty) | ||
| 335 | /* | ||
| 336 | * The page is dirty and we are appending, so the inode | ||
| 337 | * has to be marked as dirty. However, it is already | ||
| 338 | * dirty, so we do not need any budget. We may return, | ||
| 339 | * but @ui->ui_mutex hast to be left locked because we | ||
| 340 | * should prevent write-back from flushing the inode | ||
| 341 | * and freeing the budget. The lock will be released in | ||
| 342 | * 'ubifs_write_end()'. | ||
| 343 | */ | ||
| 344 | return 0; | ||
| 345 | |||
| 346 | /* | ||
| 347 | * The page is dirty, we are appending, the inode is clean, so | ||
| 348 | * we need to budget the inode change. | ||
| 349 | */ | ||
| 350 | req.dirtied_ino = 1; | ||
| 351 | } else { | ||
| 352 | if (PageChecked(page)) | ||
| 353 | /* | ||
| 354 | * The page corresponds to a hole and does not | ||
| 355 | * exist on the media. So changing it makes | ||
| 356 | * make the amount of indexing information | ||
| 357 | * larger, and we have to budget for a new | ||
| 358 | * page. | ||
| 359 | */ | ||
| 360 | req.new_page = 1; | ||
| 361 | else | ||
| 362 | /* | ||
| 363 | * Not a hole, the change will not add any new | ||
| 364 | * indexing information, budget for page | ||
| 365 | * change. | ||
| 366 | */ | ||
| 367 | req.dirtied_page = 1; | ||
| 368 | |||
| 369 | if (appending) { | ||
| 370 | mutex_lock(&ui->ui_mutex); | ||
| 371 | if (!ui->dirty) | ||
| 372 | /* | ||
| 373 | * The inode is clean but we will have to mark | ||
| 374 | * it as dirty because we are appending. This | ||
| 375 | * needs a budget. | ||
| 376 | */ | ||
| 377 | req.dirtied_ino = 1; | ||
| 378 | } | ||
| 379 | } | ||
| 380 | |||
| 381 | return ubifs_budget_space(c, &req); | ||
| 382 | } | ||
| 383 | |||
| 384 | /* | ||
| 385 | * This function is called when a page of data is going to be written. Since | ||
| 386 | * the page of data will not necessarily go to the flash straight away, UBIFS | ||
| 387 | * has to reserve space on the media for it, which is done by means of | ||
| 388 | * budgeting. | ||
| 389 | * | ||
| 390 | * This is the hot-path of the file-system and we are trying to optimize it as | ||
| 391 | * much as possible. For this reasons it is split on 2 parts - slow and fast. | ||
| 392 | * | ||
| 393 | * There many budgeting cases: | ||
| 394 | * o a new page is appended - we have to budget for a new page and for | ||
| 395 | * changing the inode; however, if the inode is already dirty, there is | ||
| 396 | * no need to budget for it; | ||
| 397 | * o an existing clean page is changed - we have budget for it; if the page | ||
| 398 | * does not exist on the media (a hole), we have to budget for a new | ||
| 399 | * page; otherwise, we may budget for changing an existing page; the | ||
| 400 | * difference between these cases is that changing an existing page does | ||
| 401 | * not introduce anything new to the FS indexing information, so it does | ||
| 402 | * not grow, and smaller budget is acquired in this case; | ||
| 403 | * o an existing dirty page is changed - no need to budget at all, because | ||
| 404 | * the page budget has been acquired by earlier, when the page has been | ||
| 405 | * marked dirty. | ||
| 406 | * | ||
| 407 | * UBIFS budgeting sub-system may force write-back if it thinks there is no | ||
| 408 | * space to reserve. This imposes some locking restrictions and makes it | ||
| 409 | * impossible to take into account the above cases, and makes it impossible to | ||
| 410 | * optimize budgeting. | ||
| 411 | * | ||
| 412 | * The solution for this is that the fast path of 'ubifs_write_begin()' assumes | ||
| 413 | * there is a plenty of flash space and the budget will be acquired quickly, | ||
| 414 | * without forcing write-back. The slow path does not make this assumption. | ||
| 415 | */ | ||
| 416 | static int ubifs_write_begin(struct file *file, struct address_space *mapping, | ||
| 417 | loff_t pos, unsigned len, unsigned flags, | ||
| 418 | struct page **pagep, void **fsdata) | ||
| 419 | { | ||
| 420 | struct inode *inode = mapping->host; | ||
| 421 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 422 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 423 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
| 424 | int uninitialized_var(err), appending = !!(pos + len > inode->i_size); | ||
| 425 | struct page *page; | ||
| 426 | |||
| 427 | |||
| 428 | ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); | ||
| 429 | |||
| 430 | if (unlikely(c->ro_media)) | ||
| 431 | return -EROFS; | ||
| 432 | |||
| 433 | /* Try out the fast-path part first */ | ||
| 434 | page = __grab_cache_page(mapping, index); | ||
| 435 | if (unlikely(!page)) | ||
| 436 | return -ENOMEM; | ||
| 437 | |||
| 438 | if (!PageUptodate(page)) { | ||
| 439 | /* The page is not loaded from the flash */ | ||
| 440 | if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) | ||
| 441 | /* | ||
| 442 | * We change whole page so no need to load it. But we | ||
| 443 | * have to set the @PG_checked flag to make the further | ||
| 444 | * code the page is new. This might be not true, but it | ||
| 445 | * is better to budget more that to read the page from | ||
| 446 | * the media. | ||
| 447 | */ | ||
| 448 | SetPageChecked(page); | ||
| 449 | else { | ||
| 450 | err = do_readpage(page); | ||
| 451 | if (err) { | ||
| 452 | unlock_page(page); | ||
| 453 | page_cache_release(page); | ||
| 454 | return err; | ||
| 455 | } | ||
| 456 | } | ||
| 457 | |||
| 458 | SetPageUptodate(page); | ||
| 459 | ClearPageError(page); | ||
| 460 | } | ||
| 461 | |||
| 462 | err = allocate_budget(c, page, ui, appending); | ||
| 463 | if (unlikely(err)) { | ||
| 464 | ubifs_assert(err == -ENOSPC); | ||
| 465 | /* | ||
| 466 | * Budgeting failed which means it would have to force | ||
| 467 | * write-back but didn't, because we set the @fast flag in the | ||
| 468 | * request. Write-back cannot be done now, while we have the | ||
| 469 | * page locked, because it would deadlock. Unlock and free | ||
| 470 | * everything and fall-back to slow-path. | ||
| 471 | */ | ||
| 472 | if (appending) { | ||
| 473 | ubifs_assert(mutex_is_locked(&ui->ui_mutex)); | ||
| 474 | mutex_unlock(&ui->ui_mutex); | ||
| 475 | } | ||
| 476 | unlock_page(page); | ||
| 477 | page_cache_release(page); | ||
| 478 | |||
| 479 | return write_begin_slow(mapping, pos, len, pagep); | ||
| 480 | } | ||
| 481 | |||
| 482 | /* | ||
| 483 | * Whee, we aquired budgeting quickly - without involving | ||
| 484 | * garbage-collection, committing or forceing write-back. We return | ||
| 485 | * with @ui->ui_mutex locked if we are appending pages, and unlocked | ||
| 486 | * otherwise. This is an optimization (slightly hacky though). | ||
| 487 | */ | ||
| 488 | *pagep = page; | ||
| 489 | return 0; | ||
| 490 | |||
| 491 | } | ||
| 492 | |||
| 493 | /** | ||
| 494 | * cancel_budget - cancel budget. | ||
| 495 | * @c: UBIFS file-system description object | ||
| 496 | * @page: page to cancel budget for | ||
| 497 | * @ui: UBIFS inode object the page belongs to | ||
| 498 | * @appending: non-zero if the page is appended | ||
| 499 | * | ||
| 500 | * This is a helper function for a page write operation. It unlocks the | ||
| 501 | * @ui->ui_mutex in case of appending. | ||
| 502 | */ | ||
| 503 | static void cancel_budget(struct ubifs_info *c, struct page *page, | ||
| 504 | struct ubifs_inode *ui, int appending) | ||
| 505 | { | ||
| 506 | if (appending) { | ||
| 507 | if (!ui->dirty) | ||
| 508 | ubifs_release_dirty_inode_budget(c, ui); | ||
| 509 | mutex_unlock(&ui->ui_mutex); | ||
| 510 | } | ||
| 511 | if (!PagePrivate(page)) { | ||
| 512 | if (PageChecked(page)) | ||
| 513 | release_new_page_budget(c); | ||
| 514 | else | ||
| 515 | release_existing_page_budget(c); | ||
| 516 | } | ||
| 517 | } | ||
| 518 | |||
| 519 | static int ubifs_write_end(struct file *file, struct address_space *mapping, | ||
| 520 | loff_t pos, unsigned len, unsigned copied, | ||
| 521 | struct page *page, void *fsdata) | ||
| 522 | { | ||
| 523 | struct inode *inode = mapping->host; | ||
| 524 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 525 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 526 | loff_t end_pos = pos + len; | ||
| 527 | int appending = !!(end_pos > inode->i_size); | ||
| 528 | |||
| 529 | dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", | ||
| 530 | inode->i_ino, pos, page->index, len, copied, inode->i_size); | ||
| 531 | |||
| 532 | if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { | ||
| 533 | /* | ||
| 534 | * VFS copied less data to the page that it intended and | ||
| 535 | * declared in its '->write_begin()' call via the @len | ||
| 536 | * argument. If the page was not up-to-date, and @len was | ||
| 537 | * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did | ||
| 538 | * not load it from the media (for optimization reasons). This | ||
| 539 | * means that part of the page contains garbage. So read the | ||
| 540 | * page now. | ||
| 541 | */ | ||
| 542 | dbg_gen("copied %d instead of %d, read page and repeat", | ||
| 543 | copied, len); | ||
| 544 | cancel_budget(c, page, ui, appending); | ||
| 545 | |||
| 546 | /* | ||
| 547 | * Return 0 to force VFS to repeat the whole operation, or the | ||
| 548 | * error code if 'do_readpage()' failes. | ||
| 549 | */ | ||
| 550 | copied = do_readpage(page); | ||
| 551 | goto out; | ||
| 552 | } | ||
| 553 | |||
| 554 | if (!PagePrivate(page)) { | ||
| 555 | SetPagePrivate(page); | ||
| 556 | atomic_long_inc(&c->dirty_pg_cnt); | ||
| 557 | __set_page_dirty_nobuffers(page); | ||
| 558 | } | ||
| 559 | |||
| 560 | if (appending) { | ||
| 561 | i_size_write(inode, end_pos); | ||
| 562 | ui->ui_size = end_pos; | ||
| 563 | /* | ||
| 564 | * Note, we do not set @I_DIRTY_PAGES (which means that the | ||
| 565 | * inode has dirty pages), this has been done in | ||
| 566 | * '__set_page_dirty_nobuffers()'. | ||
| 567 | */ | ||
| 568 | __mark_inode_dirty(inode, I_DIRTY_DATASYNC); | ||
| 569 | ubifs_assert(mutex_is_locked(&ui->ui_mutex)); | ||
| 570 | mutex_unlock(&ui->ui_mutex); | ||
| 571 | } | ||
| 572 | |||
| 573 | out: | ||
| 574 | unlock_page(page); | ||
| 575 | page_cache_release(page); | ||
| 576 | return copied; | ||
| 577 | } | ||
| 578 | |||
| 579 | static int ubifs_readpage(struct file *file, struct page *page) | ||
| 580 | { | ||
| 581 | do_readpage(page); | ||
| 582 | unlock_page(page); | ||
| 583 | return 0; | ||
| 584 | } | ||
| 585 | |||
| 586 | static int do_writepage(struct page *page, int len) | ||
| 587 | { | ||
| 588 | int err = 0, i, blen; | ||
| 589 | unsigned int block; | ||
| 590 | void *addr; | ||
| 591 | union ubifs_key key; | ||
| 592 | struct inode *inode = page->mapping->host; | ||
| 593 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 594 | |||
| 595 | #ifdef UBIFS_DEBUG | ||
| 596 | spin_lock(&ui->ui_lock); | ||
| 597 | ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); | ||
| 598 | spin_unlock(&ui->ui_lock); | ||
| 599 | #endif | ||
| 600 | |||
| 601 | /* Update radix tree tags */ | ||
| 602 | set_page_writeback(page); | ||
| 603 | |||
| 604 | addr = kmap(page); | ||
| 605 | block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; | ||
| 606 | i = 0; | ||
| 607 | while (len) { | ||
| 608 | blen = min_t(int, len, UBIFS_BLOCK_SIZE); | ||
| 609 | data_key_init(c, &key, inode->i_ino, block); | ||
| 610 | err = ubifs_jnl_write_data(c, inode, &key, addr, blen); | ||
| 611 | if (err) | ||
| 612 | break; | ||
| 613 | if (++i >= UBIFS_BLOCKS_PER_PAGE) | ||
| 614 | break; | ||
| 615 | block += 1; | ||
| 616 | addr += blen; | ||
| 617 | len -= blen; | ||
| 618 | } | ||
| 619 | if (err) { | ||
| 620 | SetPageError(page); | ||
| 621 | ubifs_err("cannot write page %lu of inode %lu, error %d", | ||
| 622 | page->index, inode->i_ino, err); | ||
| 623 | ubifs_ro_mode(c, err); | ||
| 624 | } | ||
| 625 | |||
| 626 | ubifs_assert(PagePrivate(page)); | ||
| 627 | if (PageChecked(page)) | ||
| 628 | release_new_page_budget(c); | ||
| 629 | else | ||
| 630 | release_existing_page_budget(c); | ||
| 631 | |||
| 632 | atomic_long_dec(&c->dirty_pg_cnt); | ||
| 633 | ClearPagePrivate(page); | ||
| 634 | ClearPageChecked(page); | ||
| 635 | |||
| 636 | kunmap(page); | ||
| 637 | unlock_page(page); | ||
| 638 | end_page_writeback(page); | ||
| 639 | return err; | ||
| 640 | } | ||
| 641 | |||
/*
 * When writing-back dirty inodes, VFS first writes-back pages belonging to the
 * inode, then the inode itself. For UBIFS this may cause a problem. Consider a
 * situation when we have an inode with size 0, then a megabyte of data is
 * appended to the inode, then write-back starts and flushes some amount of the
 * dirty pages, the journal becomes full, commit happens and finishes, and then
 * an unclean reboot happens. When the file system is mounted next time, the
 * inode size would still be 0, but there would be many pages which are beyond
 * the inode size, they would be indexed and consume flash space. Because the
 * journal has been committed, the replay would not be able to detect this
 * situation and correct the inode size. This means UBIFS would have to scan
 * the whole index and correct all inode sizes, which is long and unacceptable.
 *
 * To prevent situations like this, UBIFS writes pages back only if they are
 * within the last synchronized inode size, i.e. the size which has been
 * written to the flash media last time. Otherwise, UBIFS forces inode
 * write-back, thus making sure the on-flash inode contains current inode size,
 * and then keeps writing pages back.
 *
 * Some locking issues explanation. 'ubifs_writepage()' first is called with
 * the page locked, and it locks @ui_mutex. However, write-back does take inode
 * @i_mutex, which means other VFS operations may be run on this inode at the
 * same time. And the problematic one is truncation to smaller size, from where
 * we have to call 'vmtruncate()', which first changes @inode->i_size, then
 * drops the truncated pages. And while dropping the pages, it takes the page
 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
 * means that @inode->i_size is changed while @ui_mutex is unlocked.
 *
 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
 * inode size. How do we do this if @inode->i_size may become smaller while we
 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
 * @ui->ui_size "shadow" field which UBIFS uses instead of @inode->i_size
 * internally and updates it under @ui_mutex.
 *
 * Q: why we do not worry that if we race with truncation, we may end up with a
 * situation when the inode is truncated while we are in the middle of
 * 'do_writepage()', so we do write beyond inode size?
 * A: If we are in the middle of 'do_writepage()', truncation would be locked
 * on the page lock and it would not write the truncated inode node to the
 * journal before we have finished.
 */
static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct ubifs_inode *ui = ubifs_inode(inode);
	loff_t i_size = i_size_read(inode), synced_i_size;
	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	/* @len is the number of valid bytes in the (possibly last) page */
	int err, len = i_size & (PAGE_CACHE_SIZE - 1);
	void *kaddr;

	dbg_gen("ino %lu, pg %lu, pg flags %#lx",
		inode->i_ino, page->index, page->flags);
	ubifs_assert(PagePrivate(page));

	/* Is the page fully outside @i_size? (truncate in progress) */
	if (page->index > end_index || (page->index == end_index && !len)) {
		err = 0;
		goto out_unlock;
	}

	spin_lock(&ui->ui_lock);
	synced_i_size = ui->synced_i_size;
	spin_unlock(&ui->ui_lock);

	/* Is the page fully inside @i_size? */
	if (page->index < end_index) {
		if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
			/*
			 * The page is beyond the on-flash inode size - force
			 * the inode to the media first (see the comment above
			 * this function).
			 */
			err = inode->i_sb->s_op->write_inode(inode, 1);
			if (err)
				goto out_unlock;
			/*
			 * The inode has been written, but the write-buffer has
			 * not been synchronized, so in case of an unclean
			 * reboot we may end up with some pages beyond inode
			 * size, but they would be in the journal (because
			 * commit flushes write buffers) and recovery would deal
			 * with this.
			 */
		}
		return do_writepage(page, PAGE_CACHE_SIZE);
	}

	/*
	 * The page straddles @i_size. It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped. "A file is mapped
	 * in multiples of the page size. For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

	if (i_size > synced_i_size) {
		/* Make sure the on-flash inode size is up to date first */
		err = inode->i_sb->s_op->write_inode(inode, 1);
		if (err)
			goto out_unlock;
	}

	return do_writepage(page, len);

out_unlock:
	unlock_page(page);
	return err;
}
| 749 | |||
| 750 | /** | ||
| 751 | * do_attr_changes - change inode attributes. | ||
| 752 | * @inode: inode to change attributes for | ||
| 753 | * @attr: describes attributes to change | ||
| 754 | */ | ||
| 755 | static void do_attr_changes(struct inode *inode, const struct iattr *attr) | ||
| 756 | { | ||
| 757 | if (attr->ia_valid & ATTR_UID) | ||
| 758 | inode->i_uid = attr->ia_uid; | ||
| 759 | if (attr->ia_valid & ATTR_GID) | ||
| 760 | inode->i_gid = attr->ia_gid; | ||
| 761 | if (attr->ia_valid & ATTR_ATIME) | ||
| 762 | inode->i_atime = timespec_trunc(attr->ia_atime, | ||
| 763 | inode->i_sb->s_time_gran); | ||
| 764 | if (attr->ia_valid & ATTR_MTIME) | ||
| 765 | inode->i_mtime = timespec_trunc(attr->ia_mtime, | ||
| 766 | inode->i_sb->s_time_gran); | ||
| 767 | if (attr->ia_valid & ATTR_CTIME) | ||
| 768 | inode->i_ctime = timespec_trunc(attr->ia_ctime, | ||
| 769 | inode->i_sb->s_time_gran); | ||
| 770 | if (attr->ia_valid & ATTR_MODE) { | ||
| 771 | umode_t mode = attr->ia_mode; | ||
| 772 | |||
| 773 | if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) | ||
| 774 | mode &= ~S_ISGID; | ||
| 775 | inode->i_mode = mode; | ||
| 776 | } | ||
| 777 | } | ||
| 778 | |||
/**
 * do_truncation - truncate an inode.
 * @c: UBIFS file-system description object
 * @inode: inode to truncate
 * @attr: inode attribute changes description
 *
 * This function implements VFS '->setattr()' call when the inode is truncated
 * to a smaller size. Returns zero in case of success and a negative error code
 * in case of failure.
 */
static int do_truncation(struct ubifs_info *c, struct inode *inode,
			 const struct iattr *attr)
{
	int err;
	struct ubifs_budget_req req;
	loff_t old_size = inode->i_size, new_size = attr->ia_size;
	/* Offset of the new size inside its (partially used) last block */
	int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
	struct ubifs_inode *ui = ubifs_inode(inode);

	dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
	memset(&req, 0, sizeof(struct ubifs_budget_req));

	/*
	 * If this is truncation to a smaller size, and we do not truncate on a
	 * block boundary, budget for changing one data block, because the last
	 * block will be re-written.
	 */
	if (new_size & (UBIFS_BLOCK_SIZE - 1))
		req.dirtied_page = 1;

	req.dirtied_ino = 1;
	/* A funny way to budget for truncation node */
	req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	/*
	 * Note, @ui_mutex must not be held here: 'vmtruncate()' locks the
	 * truncated pages, and 'ubifs_writepage()' takes @ui_mutex with the
	 * page lock held, so holding @ui_mutex here would deadlock (see the
	 * comment before 'ubifs_writepage()').
	 */
	err = vmtruncate(inode, new_size);
	if (err)
		goto out_budg;

	if (offset) {
		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
		struct page *page;

		page = find_lock_page(inode->i_mapping, index);
		if (page) {
			if (PageDirty(page)) {
				/*
				 * 'ubifs_jnl_truncate()' will try to truncate
				 * the last data node, but it contains
				 * out-of-date data because the page is dirty.
				 * Write the page now, so that
				 * 'ubifs_jnl_truncate()' will see an already
				 * truncated (and up to date) data node.
				 */
				ubifs_assert(PagePrivate(page));

				clear_page_dirty_for_io(page);
				if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
					offset = new_size &
						 (PAGE_CACHE_SIZE - 1);
				err = do_writepage(page, offset);
				page_cache_release(page);
				if (err)
					goto out_budg;
				/*
				 * We could now tell 'ubifs_jnl_truncate()' not
				 * to read the last block.
				 */
			} else {
				/*
				 * We could 'kmap()' the page and pass the data
				 * to 'ubifs_jnl_truncate()' to save it from
				 * having to read it.
				 */
				unlock_page(page);
				page_cache_release(page);
			}
		}
	}

	mutex_lock(&ui->ui_mutex);
	/* 'vmtruncate()' changed @i_size, update the @ui_size shadow field */
	ui->ui_size = inode->i_size;
	/* Truncation changes inode [mc]time */
	inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
	/* The other attributes may be changed at the same time as well */
	do_attr_changes(inode, attr);

	err = ubifs_jnl_truncate(c, inode, old_size, new_size);
	mutex_unlock(&ui->ui_mutex);
out_budg:
	ubifs_release_budget(c, &req);
	return err;
}
| 874 | |||
/**
 * do_setattr - change inode attributes.
 * @c: UBIFS file-system description object
 * @inode: inode to change attributes for
 * @attr: inode attribute changes description
 *
 * This function implements VFS '->setattr()' call for all cases except
 * truncations to smaller size. Returns zero in case of success and a negative
 * error code in case of failure.
 */
static int do_setattr(struct ubifs_info *c, struct inode *inode,
		      const struct iattr *attr)
{
	int err, release;
	loff_t new_size = attr->ia_size;
	struct ubifs_inode *ui = ubifs_inode(inode);
	/* Budget for dirtying the inode (plus its data area) */
	struct ubifs_budget_req req = { .dirtied_ino = 1,
					.dirtied_ino_d = ui->data_len };

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	if (attr->ia_valid & ATTR_SIZE) {
		/* Size change to the same or larger size (extends the file) */
		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
		err = vmtruncate(inode, new_size);
		if (err)
			goto out;
	}

	mutex_lock(&ui->ui_mutex);
	if (attr->ia_valid & ATTR_SIZE) {
		/* Truncation changes inode [mc]time */
		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
		/* 'vmtruncate()' changed @i_size, update @ui_size */
		ui->ui_size = inode->i_size;
	}

	do_attr_changes(inode, attr);

	/*
	 * If the inode was dirty already, it has already been budgeted for,
	 * so the budget taken above is redundant and released below.
	 */
	release = ui->dirty;
	if (attr->ia_valid & ATTR_SIZE)
		/*
		 * Inode length changed, so we have to make sure
		 * @I_DIRTY_DATASYNC is set.
		 */
		__mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
	else
		mark_inode_dirty_sync(inode);
	mutex_unlock(&ui->ui_mutex);

	if (release)
		ubifs_release_budget(c, &req);
	if (IS_SYNC(inode))
		/* @err is zero at this point - return the write-out result */
		err = inode->i_sb->s_op->write_inode(inode, 1);
	return err;

out:
	ubifs_release_budget(c, &req);
	return err;
}
| 936 | |||
| 937 | int ubifs_setattr(struct dentry *dentry, struct iattr *attr) | ||
| 938 | { | ||
| 939 | int err; | ||
| 940 | struct inode *inode = dentry->d_inode; | ||
| 941 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 942 | |||
| 943 | dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); | ||
| 944 | err = inode_change_ok(inode, attr); | ||
| 945 | if (err) | ||
| 946 | return err; | ||
| 947 | |||
| 948 | err = dbg_check_synced_i_size(inode); | ||
| 949 | if (err) | ||
| 950 | return err; | ||
| 951 | |||
| 952 | if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) | ||
| 953 | /* Truncation to a smaller size */ | ||
| 954 | err = do_truncation(c, inode, attr); | ||
| 955 | else | ||
| 956 | err = do_setattr(c, inode, attr); | ||
| 957 | |||
| 958 | return err; | ||
| 959 | } | ||
| 960 | |||
| 961 | static void ubifs_invalidatepage(struct page *page, unsigned long offset) | ||
| 962 | { | ||
| 963 | struct inode *inode = page->mapping->host; | ||
| 964 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 965 | |||
| 966 | ubifs_assert(PagePrivate(page)); | ||
| 967 | if (offset) | ||
| 968 | /* Partial page remains dirty */ | ||
| 969 | return; | ||
| 970 | |||
| 971 | if (PageChecked(page)) | ||
| 972 | release_new_page_budget(c); | ||
| 973 | else | ||
| 974 | release_existing_page_budget(c); | ||
| 975 | |||
| 976 | atomic_long_dec(&c->dirty_pg_cnt); | ||
| 977 | ClearPagePrivate(page); | ||
| 978 | ClearPageChecked(page); | ||
| 979 | } | ||
| 980 | |||
| 981 | static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) | ||
| 982 | { | ||
| 983 | struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); | ||
| 984 | |||
| 985 | nd_set_link(nd, ui->data); | ||
| 986 | return NULL; | ||
| 987 | } | ||
| 988 | |||
| 989 | int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) | ||
| 990 | { | ||
| 991 | struct inode *inode = dentry->d_inode; | ||
| 992 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 993 | int err; | ||
| 994 | |||
| 995 | dbg_gen("syncing inode %lu", inode->i_ino); | ||
| 996 | |||
| 997 | /* | ||
| 998 | * VFS has already synchronized dirty pages for this inode. Synchronize | ||
| 999 | * the inode unless this is a 'datasync()' call. | ||
| 1000 | */ | ||
| 1001 | if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { | ||
| 1002 | err = inode->i_sb->s_op->write_inode(inode, 1); | ||
| 1003 | if (err) | ||
| 1004 | return err; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * Nodes related to this inode may still sit in a write-buffer. Flush | ||
| 1009 | * them. | ||
| 1010 | */ | ||
| 1011 | err = ubifs_sync_wbufs_by_inode(c, inode); | ||
| 1012 | if (err) | ||
| 1013 | return err; | ||
| 1014 | |||
| 1015 | return 0; | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | /** | ||
| 1019 | * mctime_update_needed - check if mtime or ctime update is needed. | ||
| 1020 | * @inode: the inode to do the check for | ||
| 1021 | * @now: current time | ||
| 1022 | * | ||
| 1023 | * This helper function checks if the inode mtime/ctime should be updated or | ||
| 1024 | * not. If current values of the time-stamps are within the UBIFS inode time | ||
| 1025 | * granularity, they are not updated. This is an optimization. | ||
| 1026 | */ | ||
| 1027 | static inline int mctime_update_needed(const struct inode *inode, | ||
| 1028 | const struct timespec *now) | ||
| 1029 | { | ||
| 1030 | if (!timespec_equal(&inode->i_mtime, now) || | ||
| 1031 | !timespec_equal(&inode->i_ctime, now)) | ||
| 1032 | return 1; | ||
| 1033 | return 0; | ||
| 1034 | } | ||
| 1035 | |||
/**
 * update_mctime - update mtime and ctime of an inode.
 * @c: UBIFS file-system description object
 * @inode: inode to update
 *
 * This function updates mtime and ctime of the inode if they are not
 * equivalent to current time. Returns zero in case of success and a negative
 * error code in case of failure.
 *
 * (Note, the function was previously documented under the wrong name
 * 'update_ctime'.)
 */
static int update_mctime(struct ubifs_info *c, struct inode *inode)
{
	struct timespec now = ubifs_current_time(inode);
	struct ubifs_inode *ui = ubifs_inode(inode);

	if (mctime_update_needed(inode, &now)) {
		int err, release;
		/* Budget for dirtying the inode (plus its data area) */
		struct ubifs_budget_req req = { .dirtied_ino = 1,
						.dirtied_ino_d = ui->data_len };

		err = ubifs_budget_space(c, &req);
		if (err)
			return err;

		mutex_lock(&ui->ui_mutex);
		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
		/* If the inode was dirty already, give the budget back */
		release = ui->dirty;
		mark_inode_dirty_sync(inode);
		mutex_unlock(&ui->ui_mutex);
		if (release)
			ubifs_release_budget(c, &req);
	}

	return 0;
}
| 1070 | |||
| 1071 | static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 1072 | unsigned long nr_segs, loff_t pos) | ||
| 1073 | { | ||
| 1074 | int err; | ||
| 1075 | ssize_t ret; | ||
| 1076 | struct inode *inode = iocb->ki_filp->f_mapping->host; | ||
| 1077 | struct ubifs_info *c = inode->i_sb->s_fs_info; | ||
| 1078 | |||
| 1079 | err = update_mctime(c, inode); | ||
| 1080 | if (err) | ||
| 1081 | return err; | ||
| 1082 | |||
| 1083 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
| 1084 | if (ret < 0) | ||
| 1085 | return ret; | ||
| 1086 | |||
| 1087 | if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { | ||
| 1088 | err = ubifs_sync_wbufs_by_inode(c, inode); | ||
| 1089 | if (err) | ||
| 1090 | return err; | ||
| 1091 | } | ||
| 1092 | |||
| 1093 | return ret; | ||
| 1094 | } | ||
| 1095 | |||
/*
 * VFS '->set_page_dirty()' callback. In UBIFS pages are only ever dirtied via
 * paths that budget for them first, so this must be a no-op in practice.
 */
static int ubifs_set_page_dirty(struct page *page)
{
	int rc = __set_page_dirty_nobuffers(page);

	/*
	 * An attempt to dirty a page without budgeting for it - should not
	 * happen.
	 */
	ubifs_assert(rc == 0);
	return rc;
}
| 1108 | |||
/*
 * VFS '->releasepage()' callback. UBIFS uses @PG_private to mark budgeted
 * dirty pages, so being asked to release such a page indicates a bug; the
 * assertion complains in debug builds, and the page state is cleaned up so
 * the release can proceed anyway.
 */
static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
{
	/*
	 * An attempt to release a dirty page without budgeting for it - should
	 * not happen.
	 */
	if (PageWriteback(page))
		return 0;
	ubifs_assert(PagePrivate(page));
	ubifs_assert(0);
	ClearPagePrivate(page);
	ClearPageChecked(page);
	return 1;
}
| 1123 | |||
/*
 * mmap()d file has taken write protection fault and is being made
 * writable. UBIFS must ensure page is budgeted for. Returns zero on success
 * and a negative error code on failure.
 */
static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	struct ubifs_info *c = inode->i_sb->s_fs_info;
	struct timespec now = ubifs_current_time(inode);
	/* Pessimistically budget for dirtying a brand-new page */
	struct ubifs_budget_req req = { .new_page = 1 };
	int err, update_time;

	dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
		i_size_read(inode));
	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));

	if (unlikely(c->ro_media))
		/* The media went read-only - refuse the write fault */
		return -EROFS;

	/*
	 * We have not locked @page so far so we may budget for changing the
	 * page. Note, we cannot do this after we locked the page, because
	 * budgeting may cause write-back which would cause deadlock.
	 *
	 * At the moment we do not know whether the page is dirty or not, so we
	 * assume that it is not and budget for a new page. We could look at
	 * the @PG_private flag and figure this out, but we may race with write
	 * back and the page state may change by the time we lock it, so this
	 * would need additional care. We do not bother with this at the
	 * moment, although it might be good idea to do. Instead, we allocate
	 * budget for a new page and amend it later on if the page was in fact
	 * dirty.
	 *
	 * The budgeting-related logic of this function is similar to what we
	 * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there
	 * for more comments.
	 */
	update_time = mctime_update_needed(inode, &now);
	if (update_time)
		/*
		 * We have to change inode time stamp which requires extra
		 * budgeting.
		 */
		req.dirtied_ino = 1;

	err = ubifs_budget_space(c, &req);
	if (unlikely(err)) {
		if (err == -ENOSPC)
			ubifs_warn("out of space for mmapped file "
				   "(inode number %lu)", inode->i_ino);
		return err;
	}

	lock_page(page);
	if (unlikely(page->mapping != inode->i_mapping ||
		     page_offset(page) > i_size_read(inode))) {
		/* Page got truncated out from underneath us */
		err = -EINVAL;
		goto out_unlock;
	}

	if (PagePrivate(page))
		/* The page is already dirty and budgeted - undo our budget */
		release_new_page_budget(c);
	else {
		/* First time this page is dirtied - keep the budget */
		if (!PageChecked(page))
			ubifs_convert_page_budget(c);
		SetPagePrivate(page);
		atomic_long_inc(&c->dirty_pg_cnt);
		__set_page_dirty_nobuffers(page);
	}

	if (update_time) {
		int release;
		struct ubifs_inode *ui = ubifs_inode(inode);

		mutex_lock(&ui->ui_mutex);
		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
		/* If the inode was dirty already, give that budget back */
		release = ui->dirty;
		mark_inode_dirty_sync(inode);
		mutex_unlock(&ui->ui_mutex);
		if (release)
			ubifs_release_dirty_inode_budget(c, ui);
	}

	unlock_page(page);
	return 0;

out_unlock:
	unlock_page(page);
	ubifs_release_budget(c, &req);
	return err;
}
| 1216 | |||
/* VM operations for mmapped UBIFS files */
static struct vm_operations_struct ubifs_file_vm_ops = {
	.fault = filemap_fault,
	.page_mkwrite = ubifs_vm_page_mkwrite,
};
| 1221 | |||
| 1222 | static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 1223 | { | ||
| 1224 | int err; | ||
| 1225 | |||
| 1226 | /* 'generic_file_mmap()' takes care of NOMMU case */ | ||
| 1227 | err = generic_file_mmap(file, vma); | ||
| 1228 | if (err) | ||
| 1229 | return err; | ||
| 1230 | vma->vm_ops = &ubifs_file_vm_ops; | ||
| 1231 | return 0; | ||
| 1232 | } | ||
| 1233 | |||
/* Address-space operations for regular UBIFS files */
struct address_space_operations ubifs_file_address_operations = {
	.readpage = ubifs_readpage,
	.writepage = ubifs_writepage,
	.write_begin = ubifs_write_begin,
	.write_end = ubifs_write_end,
	.invalidatepage = ubifs_invalidatepage,
	.set_page_dirty = ubifs_set_page_dirty,
	.releasepage = ubifs_releasepage,
};

/* Inode operations for regular UBIFS files */
struct inode_operations ubifs_file_inode_operations = {
	.setattr = ubifs_setattr,
	.getattr = ubifs_getattr,
#ifdef CONFIG_UBIFS_FS_XATTR
	.setxattr = ubifs_setxattr,
	.getxattr = ubifs_getxattr,
	.listxattr = ubifs_listxattr,
	.removexattr = ubifs_removexattr,
#endif
};

/* Inode operations for UBIFS symbolic links */
struct inode_operations ubifs_symlink_inode_operations = {
	.readlink = generic_readlink,
	.follow_link = ubifs_follow_link,
	.setattr = ubifs_setattr,
	.getattr = ubifs_getattr,
};

/* File operations for regular UBIFS files */
struct file_operations ubifs_file_operations = {
	.llseek = generic_file_llseek,
	.read = do_sync_read,
	.write = do_sync_write,
	.aio_read = generic_file_aio_read,
	.aio_write = ubifs_aio_write,
	.mmap = ubifs_file_mmap,
	.fsync = ubifs_fsync,
	.unlocked_ioctl = ubifs_ioctl,
	.splice_read = generic_file_splice_read,
#ifdef CONFIG_COMPAT
	.compat_ioctl = ubifs_compat_ioctl,
#endif
};
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c new file mode 100644 index 000000000000..10394c548367 --- /dev/null +++ b/fs/ubifs/find.c | |||
| @@ -0,0 +1,975 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file contains functions for finding LEBs for various purposes e.g. | ||
| 25 | * garbage collection. In general, lprops category heaps and lists are used | ||
| 26 | * for fast access, falling back on scanning the LPT as a last resort. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <linux/sort.h> | ||
| 30 | #include "ubifs.h" | ||
| 31 | |||
/**
 * struct scan_data - data provided to scan callback functions
 * @min_space: minimum number of bytes for which to scan
 * @pick_free: whether it is OK to scan for empty LEBs
 * @lnum: LEB number found is returned here (output field, filled in by the
 *        scan callback when a suitable LEB is found)
 * @exclude_index: whether to exclude index LEBs
 */
struct scan_data {
	int min_space;
	int pick_free;
	int lnum;
	int exclude_index;
};
| 45 | |||
| 46 | /** | ||
| 47 | * valuable - determine whether LEB properties are valuable. | ||
| 48 | * @c: the UBIFS file-system description object | ||
| 49 | * @lprops: LEB properties | ||
| 50 | * | ||
| 51 | * This function return %1 if the LEB properties should be added to the LEB | ||
| 52 | * properties tree in memory. Otherwise %0 is returned. | ||
| 53 | */ | ||
| 54 | static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) | ||
| 55 | { | ||
| 56 | int n, cat = lprops->flags & LPROPS_CAT_MASK; | ||
| 57 | struct ubifs_lpt_heap *heap; | ||
| 58 | |||
| 59 | switch (cat) { | ||
| 60 | case LPROPS_DIRTY: | ||
| 61 | case LPROPS_DIRTY_IDX: | ||
| 62 | case LPROPS_FREE: | ||
| 63 | heap = &c->lpt_heap[cat - 1]; | ||
| 64 | if (heap->cnt < heap->max_cnt) | ||
| 65 | return 1; | ||
| 66 | if (lprops->free + lprops->dirty >= c->dark_wm) | ||
| 67 | return 1; | ||
| 68 | return 0; | ||
| 69 | case LPROPS_EMPTY: | ||
| 70 | n = c->lst.empty_lebs + c->freeable_cnt - | ||
| 71 | c->lst.taken_empty_lebs; | ||
| 72 | if (n < c->lsave_cnt) | ||
| 73 | return 1; | ||
| 74 | return 0; | ||
| 75 | case LPROPS_FREEABLE: | ||
| 76 | return 1; | ||
| 77 | case LPROPS_FRDI_IDX: | ||
| 78 | return 1; | ||
| 79 | } | ||
| 80 | return 0; | ||
| 81 | } | ||
| 82 | |||
/**
 * scan_for_dirty_cb - dirty space scan callback.
 * @c: the UBIFS file-system description object
 * @lprops: LEB properties to scan
 * @in_tree: whether the LEB properties are in main memory
 * @data: information passed to and from the caller of the scan; on success
 *        @data->lnum is set to the LEB number that was found
 *
 * This function returns a code that indicates whether the scan should continue
 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
 * (%LPT_SCAN_STOP). Note, %LPT_SCAN_ADD may be OR-ed into the result even
 * when the scan continues, so that valuable LEB properties get cached.
 */
static int scan_for_dirty_cb(struct ubifs_info *c,
			     const struct ubifs_lprops *lprops, int in_tree,
			     struct scan_data *data)
{
	int ret = LPT_SCAN_CONTINUE;

	/* Exclude LEBs that are currently in use */
	if (lprops->flags & LPROPS_TAKEN)
		return LPT_SCAN_CONTINUE;
	/* Determine whether to add these LEB properties to the tree */
	if (!in_tree && valuable(c, lprops))
		ret |= LPT_SCAN_ADD;
	/* Exclude LEBs with too little space */
	if (lprops->free + lprops->dirty < data->min_space)
		return ret;
	/* If specified, exclude index LEBs */
	if (data->exclude_index && lprops->flags & LPROPS_INDEX)
		return ret;
	if (lprops->free + lprops->dirty == c->leb_size) {
		/* If specified, exclude empty or freeable LEBs */
		if (!data->pick_free)
			return ret;
	} else if (lprops->dirty < c->dead_wm)
		/* Exclude LEBs with too little dirty space (unless empty) */
		return ret;
	/* Finally we found space */
	data->lnum = lprops->lnum;
	return LPT_SCAN_ADD | LPT_SCAN_STOP;
}
| 124 | |||
| 125 | /** | ||
| 126 | * scan_for_dirty - find a data LEB with free space. | ||
| 127 | * @c: the UBIFS file-system description object | ||
| 128 | * @min_space: minimum amount free plus dirty space the returned LEB has to | ||
| 129 | * have | ||
| 130 | * @pick_free: if it is OK to return a free or freeable LEB | ||
| 131 | * @exclude_index: whether to exclude index LEBs | ||
| 132 | * | ||
| 133 | * This function returns a pointer to the LEB properties found or a negative | ||
| 134 | * error code. | ||
| 135 | */ | ||
| 136 | static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, | ||
| 137 | int min_space, int pick_free, | ||
| 138 | int exclude_index) | ||
| 139 | { | ||
| 140 | const struct ubifs_lprops *lprops; | ||
| 141 | struct ubifs_lpt_heap *heap; | ||
| 142 | struct scan_data data; | ||
| 143 | int err, i; | ||
| 144 | |||
| 145 | /* There may be an LEB with enough dirty space on the free heap */ | ||
| 146 | heap = &c->lpt_heap[LPROPS_FREE - 1]; | ||
| 147 | for (i = 0; i < heap->cnt; i++) { | ||
| 148 | lprops = heap->arr[i]; | ||
| 149 | if (lprops->free + lprops->dirty < min_space) | ||
| 150 | continue; | ||
| 151 | if (lprops->dirty < c->dead_wm) | ||
| 152 | continue; | ||
| 153 | return lprops; | ||
| 154 | } | ||
| 155 | /* | ||
| 156 | * A LEB may have fallen off of the bottom of the dirty heap, and ended | ||
| 157 | * up as uncategorized even though it has enough dirty space for us now, | ||
| 158 | * so check the uncategorized list. N.B. neither empty nor freeable LEBs | ||
| 159 | * can end up as uncategorized because they are kept on lists not | ||
| 160 | * finite-sized heaps. | ||
| 161 | */ | ||
| 162 | list_for_each_entry(lprops, &c->uncat_list, list) { | ||
| 163 | if (lprops->flags & LPROPS_TAKEN) | ||
| 164 | continue; | ||
| 165 | if (lprops->free + lprops->dirty < min_space) | ||
| 166 | continue; | ||
| 167 | if (exclude_index && (lprops->flags & LPROPS_INDEX)) | ||
| 168 | continue; | ||
| 169 | if (lprops->dirty < c->dead_wm) | ||
| 170 | continue; | ||
| 171 | return lprops; | ||
| 172 | } | ||
| 173 | /* We have looked everywhere in main memory, now scan the flash */ | ||
| 174 | if (c->pnodes_have >= c->pnode_cnt) | ||
| 175 | /* All pnodes are in memory, so skip scan */ | ||
| 176 | return ERR_PTR(-ENOSPC); | ||
| 177 | data.min_space = min_space; | ||
| 178 | data.pick_free = pick_free; | ||
| 179 | data.lnum = -1; | ||
| 180 | data.exclude_index = exclude_index; | ||
| 181 | err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, | ||
| 182 | (ubifs_lpt_scan_callback)scan_for_dirty_cb, | ||
| 183 | &data); | ||
| 184 | if (err) | ||
| 185 | return ERR_PTR(err); | ||
| 186 | ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); | ||
| 187 | c->lscan_lnum = data.lnum; | ||
| 188 | lprops = ubifs_lpt_lookup_dirty(c, data.lnum); | ||
| 189 | if (IS_ERR(lprops)) | ||
| 190 | return lprops; | ||
| 191 | ubifs_assert(lprops->lnum == data.lnum); | ||
| 192 | ubifs_assert(lprops->free + lprops->dirty >= min_space); | ||
| 193 | ubifs_assert(lprops->dirty >= c->dead_wm || | ||
| 194 | (pick_free && | ||
| 195 | lprops->free + lprops->dirty == c->leb_size)); | ||
| 196 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 197 | ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); | ||
| 198 | return lprops; | ||
| 199 | } | ||
| 200 | |||
| 201 | /** | ||
| 202 | * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. | ||
| 203 | * @c: the UBIFS file-system description object | ||
| 204 | * @ret_lp: LEB properties are returned here on exit | ||
| 205 | * @min_space: minimum amount free plus dirty space the returned LEB has to | ||
| 206 | * have | ||
| 207 | * @pick_free: controls whether it is OK to pick empty or index LEBs | ||
| 208 | * | ||
| 209 | * This function tries to find a dirty logical eraseblock which has at least | ||
| 210 | * @min_space free and dirty space. It prefers to take an LEB from the dirty or | ||
| 211 | * dirty index heap, and it falls-back to LPT scanning if the heaps are empty | ||
| 212 | * or do not have an LEB which satisfies the @min_space criteria. | ||
| 213 | * | ||
| 214 | * Note: | ||
| 215 | * o LEBs which have less than dead watermark of dirty space are never picked | ||
| 216 | * by this function; | ||
| 217 | * | ||
| 218 | * Returns zero and the LEB properties of | ||
| 219 | * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a | ||
| 220 | * negative error code in case of other failures. The returned LEB is marked as | ||
| 221 | * "taken". | ||
| 222 | * | ||
| 223 | * The additional @pick_free argument controls if this function has to return a | ||
| 224 | * free or freeable LEB if one is present. For example, GC must to set it to %1, | ||
| 225 | * when called from the journal space reservation function, because the | ||
| 226 | * appearance of free space may coincide with the loss of enough dirty space | ||
| 227 | * for GC to succeed anyway. | ||
| 228 | * | ||
| 229 | * In contrast, if the Garbage Collector is called from budgeting, it should | ||
| 230 | * just make free space, not return LEBs which are already free or freeable. | ||
| 231 | * | ||
| 232 | * In addition @pick_free is set to %2 by the recovery process in order to | ||
| 233 | * recover gc_lnum in which case an index LEB must not be returned. | ||
| 234 | */ | ||
| 235 | int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | ||
| 236 | int min_space, int pick_free) | ||
| 237 | { | ||
| 238 | int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0; | ||
| 239 | const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; | ||
| 240 | struct ubifs_lpt_heap *heap, *idx_heap; | ||
| 241 | |||
| 242 | ubifs_get_lprops(c); | ||
| 243 | |||
| 244 | if (pick_free) { | ||
| 245 | int lebs, rsvd_idx_lebs = 0; | ||
| 246 | |||
| 247 | spin_lock(&c->space_lock); | ||
| 248 | lebs = c->lst.empty_lebs; | ||
| 249 | lebs += c->freeable_cnt - c->lst.taken_empty_lebs; | ||
| 250 | |||
| 251 | /* | ||
| 252 | * Note, the index may consume more LEBs than have been reserved | ||
| 253 | * for it. It is OK because it might be consolidated by GC. | ||
| 254 | * But if the index takes fewer LEBs than it is reserved for it, | ||
| 255 | * this function must avoid picking those reserved LEBs. | ||
| 256 | */ | ||
| 257 | if (c->min_idx_lebs >= c->lst.idx_lebs) { | ||
| 258 | rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; | ||
| 259 | exclude_index = 1; | ||
| 260 | } | ||
| 261 | spin_unlock(&c->space_lock); | ||
| 262 | |||
| 263 | /* Check if there are enough free LEBs for the index */ | ||
| 264 | if (rsvd_idx_lebs < lebs) { | ||
| 265 | /* OK, try to find an empty LEB */ | ||
| 266 | lp = ubifs_fast_find_empty(c); | ||
| 267 | if (lp) | ||
| 268 | goto found; | ||
| 269 | |||
| 270 | /* Or a freeable LEB */ | ||
| 271 | lp = ubifs_fast_find_freeable(c); | ||
| 272 | if (lp) | ||
| 273 | goto found; | ||
| 274 | } else | ||
| 275 | /* | ||
| 276 | * We cannot pick free/freeable LEBs in the below code. | ||
| 277 | */ | ||
| 278 | pick_free = 0; | ||
| 279 | } else { | ||
| 280 | spin_lock(&c->space_lock); | ||
| 281 | exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); | ||
| 282 | spin_unlock(&c->space_lock); | ||
| 283 | } | ||
| 284 | |||
| 285 | /* Look on the dirty and dirty index heaps */ | ||
| 286 | heap = &c->lpt_heap[LPROPS_DIRTY - 1]; | ||
| 287 | idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; | ||
| 288 | |||
| 289 | if (idx_heap->cnt && !exclude_index) { | ||
| 290 | idx_lp = idx_heap->arr[0]; | ||
| 291 | sum = idx_lp->free + idx_lp->dirty; | ||
| 292 | /* | ||
| 293 | * Since we reserve twice as more space for the index than it | ||
| 294 | * actually takes, it does not make sense to pick indexing LEBs | ||
| 295 | * with less than half LEB of dirty space. | ||
| 296 | */ | ||
| 297 | if (sum < min_space || sum < c->half_leb_size) | ||
| 298 | idx_lp = NULL; | ||
| 299 | } | ||
| 300 | |||
| 301 | if (heap->cnt) { | ||
| 302 | lp = heap->arr[0]; | ||
| 303 | if (lp->dirty + lp->free < min_space) | ||
| 304 | lp = NULL; | ||
| 305 | } | ||
| 306 | |||
| 307 | /* Pick the LEB with most space */ | ||
| 308 | if (idx_lp && lp) { | ||
| 309 | if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) | ||
| 310 | lp = idx_lp; | ||
| 311 | } else if (idx_lp && !lp) | ||
| 312 | lp = idx_lp; | ||
| 313 | |||
| 314 | if (lp) { | ||
| 315 | ubifs_assert(lp->dirty >= c->dead_wm); | ||
| 316 | goto found; | ||
| 317 | } | ||
| 318 | |||
| 319 | /* Did not find a dirty LEB on the dirty heaps, have to scan */ | ||
| 320 | dbg_find("scanning LPT for a dirty LEB"); | ||
| 321 | lp = scan_for_dirty(c, min_space, pick_free, exclude_index); | ||
| 322 | if (IS_ERR(lp)) { | ||
| 323 | err = PTR_ERR(lp); | ||
| 324 | goto out; | ||
| 325 | } | ||
| 326 | ubifs_assert(lp->dirty >= c->dead_wm || | ||
| 327 | (pick_free && lp->free + lp->dirty == c->leb_size)); | ||
| 328 | |||
| 329 | found: | ||
| 330 | dbg_find("found LEB %d, free %d, dirty %d, flags %#x", | ||
| 331 | lp->lnum, lp->free, lp->dirty, lp->flags); | ||
| 332 | |||
| 333 | lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, | ||
| 334 | lp->flags | LPROPS_TAKEN, 0); | ||
| 335 | if (IS_ERR(lp)) { | ||
| 336 | err = PTR_ERR(lp); | ||
| 337 | goto out; | ||
| 338 | } | ||
| 339 | |||
| 340 | memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); | ||
| 341 | |||
| 342 | out: | ||
| 343 | ubifs_release_lprops(c); | ||
| 344 | return err; | ||
| 345 | } | ||
| 346 | |||
| 347 | /** | ||
| 348 | * scan_for_free_cb - free space scan callback. | ||
| 349 | * @c: the UBIFS file-system description object | ||
| 350 | * @lprops: LEB properties to scan | ||
| 351 | * @in_tree: whether the LEB properties are in main memory | ||
| 352 | * @data: information passed to and from the caller of the scan | ||
| 353 | * | ||
| 354 | * This function returns a code that indicates whether the scan should continue | ||
| 355 | * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree | ||
| 356 | * in main memory (%LPT_SCAN_ADD), or whether the scan should stop | ||
| 357 | * (%LPT_SCAN_STOP). | ||
| 358 | */ | ||
| 359 | static int scan_for_free_cb(struct ubifs_info *c, | ||
| 360 | const struct ubifs_lprops *lprops, int in_tree, | ||
| 361 | struct scan_data *data) | ||
| 362 | { | ||
| 363 | int ret = LPT_SCAN_CONTINUE; | ||
| 364 | |||
| 365 | /* Exclude LEBs that are currently in use */ | ||
| 366 | if (lprops->flags & LPROPS_TAKEN) | ||
| 367 | return LPT_SCAN_CONTINUE; | ||
| 368 | /* Determine whether to add these LEB properties to the tree */ | ||
| 369 | if (!in_tree && valuable(c, lprops)) | ||
| 370 | ret |= LPT_SCAN_ADD; | ||
| 371 | /* Exclude index LEBs */ | ||
| 372 | if (lprops->flags & LPROPS_INDEX) | ||
| 373 | return ret; | ||
| 374 | /* Exclude LEBs with too little space */ | ||
| 375 | if (lprops->free < data->min_space) | ||
| 376 | return ret; | ||
| 377 | /* If specified, exclude empty LEBs */ | ||
| 378 | if (!data->pick_free && lprops->free == c->leb_size) | ||
| 379 | return ret; | ||
| 380 | /* | ||
| 381 | * LEBs that have only free and dirty space must not be allocated | ||
| 382 | * because they may have been unmapped already or they may have data | ||
| 383 | * that is obsolete only because of nodes that are still sitting in a | ||
| 384 | * wbuf. | ||
| 385 | */ | ||
| 386 | if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0) | ||
| 387 | return ret; | ||
| 388 | /* Finally we found space */ | ||
| 389 | data->lnum = lprops->lnum; | ||
| 390 | return LPT_SCAN_ADD | LPT_SCAN_STOP; | ||
| 391 | } | ||
| 392 | |||
| 393 | /** | ||
| 394 | * do_find_free_space - find a data LEB with free space. | ||
| 395 | * @c: the UBIFS file-system description object | ||
| 396 | * @min_space: minimum amount of free space required | ||
| 397 | * @pick_free: whether it is OK to scan for empty LEBs | ||
| 398 | * @squeeze: whether to try to find space in a non-empty LEB first | ||
| 399 | * | ||
| 400 | * This function returns a pointer to the LEB properties found or a negative | ||
| 401 | * error code. | ||
| 402 | */ | ||
| 403 | static | ||
| 404 | const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, | ||
| 405 | int min_space, int pick_free, | ||
| 406 | int squeeze) | ||
| 407 | { | ||
| 408 | const struct ubifs_lprops *lprops; | ||
| 409 | struct ubifs_lpt_heap *heap; | ||
| 410 | struct scan_data data; | ||
| 411 | int err, i; | ||
| 412 | |||
| 413 | if (squeeze) { | ||
| 414 | lprops = ubifs_fast_find_free(c); | ||
| 415 | if (lprops && lprops->free >= min_space) | ||
| 416 | return lprops; | ||
| 417 | } | ||
| 418 | if (pick_free) { | ||
| 419 | lprops = ubifs_fast_find_empty(c); | ||
| 420 | if (lprops) | ||
| 421 | return lprops; | ||
| 422 | } | ||
| 423 | if (!squeeze) { | ||
| 424 | lprops = ubifs_fast_find_free(c); | ||
| 425 | if (lprops && lprops->free >= min_space) | ||
| 426 | return lprops; | ||
| 427 | } | ||
| 428 | /* There may be an LEB with enough free space on the dirty heap */ | ||
| 429 | heap = &c->lpt_heap[LPROPS_DIRTY - 1]; | ||
| 430 | for (i = 0; i < heap->cnt; i++) { | ||
| 431 | lprops = heap->arr[i]; | ||
| 432 | if (lprops->free >= min_space) | ||
| 433 | return lprops; | ||
| 434 | } | ||
| 435 | /* | ||
| 436 | * A LEB may have fallen off of the bottom of the free heap, and ended | ||
| 437 | * up as uncategorized even though it has enough free space for us now, | ||
| 438 | * so check the uncategorized list. N.B. neither empty nor freeable LEBs | ||
| 439 | * can end up as uncategorized because they are kept on lists not | ||
| 440 | * finite-sized heaps. | ||
| 441 | */ | ||
| 442 | list_for_each_entry(lprops, &c->uncat_list, list) { | ||
| 443 | if (lprops->flags & LPROPS_TAKEN) | ||
| 444 | continue; | ||
| 445 | if (lprops->flags & LPROPS_INDEX) | ||
| 446 | continue; | ||
| 447 | if (lprops->free >= min_space) | ||
| 448 | return lprops; | ||
| 449 | } | ||
| 450 | /* We have looked everywhere in main memory, now scan the flash */ | ||
| 451 | if (c->pnodes_have >= c->pnode_cnt) | ||
| 452 | /* All pnodes are in memory, so skip scan */ | ||
| 453 | return ERR_PTR(-ENOSPC); | ||
| 454 | data.min_space = min_space; | ||
| 455 | data.pick_free = pick_free; | ||
| 456 | data.lnum = -1; | ||
| 457 | err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, | ||
| 458 | (ubifs_lpt_scan_callback)scan_for_free_cb, | ||
| 459 | &data); | ||
| 460 | if (err) | ||
| 461 | return ERR_PTR(err); | ||
| 462 | ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); | ||
| 463 | c->lscan_lnum = data.lnum; | ||
| 464 | lprops = ubifs_lpt_lookup_dirty(c, data.lnum); | ||
| 465 | if (IS_ERR(lprops)) | ||
| 466 | return lprops; | ||
| 467 | ubifs_assert(lprops->lnum == data.lnum); | ||
| 468 | ubifs_assert(lprops->free >= min_space); | ||
| 469 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 470 | ubifs_assert(!(lprops->flags & LPROPS_INDEX)); | ||
| 471 | return lprops; | ||
| 472 | } | ||
| 473 | |||
| 474 | /** | ||
| 475 | * ubifs_find_free_space - find a data LEB with free space. | ||
| 476 | * @c: the UBIFS file-system description object | ||
| 477 | * @min_space: minimum amount of required free space | ||
| 478 | * @free: contains amount of free space in the LEB on exit | ||
| 479 | * @squeeze: whether to try to find space in a non-empty LEB first | ||
| 480 | * | ||
| 481 | * This function looks for an LEB with at least @min_space bytes of free space. | ||
| 482 | * It tries to find an empty LEB if possible. If no empty LEBs are available, | ||
| 483 | * this function searches for a non-empty data LEB. The returned LEB is marked | ||
| 484 | * as "taken". | ||
| 485 | * | ||
| 486 | * This function returns found LEB number in case of success, %-ENOSPC if it | ||
| 487 | * failed to find a LEB with @min_space bytes of free space and other a negative | ||
| 488 | * error codes in case of failure. | ||
| 489 | */ | ||
| 490 | int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, | ||
| 491 | int squeeze) | ||
| 492 | { | ||
| 493 | const struct ubifs_lprops *lprops; | ||
| 494 | int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags; | ||
| 495 | |||
| 496 | dbg_find("min_space %d", min_space); | ||
| 497 | ubifs_get_lprops(c); | ||
| 498 | |||
| 499 | /* Check if there are enough empty LEBs for commit */ | ||
| 500 | spin_lock(&c->space_lock); | ||
| 501 | if (c->min_idx_lebs > c->lst.idx_lebs) | ||
| 502 | rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; | ||
| 503 | else | ||
| 504 | rsvd_idx_lebs = 0; | ||
| 505 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - | ||
| 506 | c->lst.taken_empty_lebs; | ||
| 507 | ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs); | ||
| 508 | if (rsvd_idx_lebs < lebs) | ||
| 509 | /* | ||
| 510 | * OK to allocate an empty LEB, but we still don't want to go | ||
| 511 | * looking for one if there aren't any. | ||
| 512 | */ | ||
| 513 | if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { | ||
| 514 | pick_free = 1; | ||
| 515 | /* | ||
| 516 | * Because we release the space lock, we must account | ||
| 517 | * for this allocation here. After the LEB properties | ||
| 518 | * flags have been updated, we subtract one. Note, the | ||
| 519 | * result of this is that lprops also decreases | ||
| 520 | * @taken_empty_lebs in 'ubifs_change_lp()', so it is | ||
| 521 | * off by one for a short period of time which may | ||
| 522 | * introduce a small disturbance to budgeting | ||
| 523 | * calculations, but this is harmless because at the | ||
| 524 | * worst case this would make the budgeting subsystem | ||
| 525 | * be more pessimistic than needed. | ||
| 526 | * | ||
| 527 | * Fundamentally, this is about serialization of the | ||
| 528 | * budgeting and lprops subsystems. We could make the | ||
| 529 | * @space_lock a mutex and avoid dropping it before | ||
| 530 | * calling 'ubifs_change_lp()', but mutex is more | ||
| 531 | * heavy-weight, and we want budgeting to be as fast as | ||
| 532 | * possible. | ||
| 533 | */ | ||
| 534 | c->lst.taken_empty_lebs += 1; | ||
| 535 | } | ||
| 536 | spin_unlock(&c->space_lock); | ||
| 537 | |||
| 538 | lprops = do_find_free_space(c, min_space, pick_free, squeeze); | ||
| 539 | if (IS_ERR(lprops)) { | ||
| 540 | err = PTR_ERR(lprops); | ||
| 541 | goto out; | ||
| 542 | } | ||
| 543 | |||
| 544 | lnum = lprops->lnum; | ||
| 545 | flags = lprops->flags | LPROPS_TAKEN; | ||
| 546 | |||
| 547 | lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0); | ||
| 548 | if (IS_ERR(lprops)) { | ||
| 549 | err = PTR_ERR(lprops); | ||
| 550 | goto out; | ||
| 551 | } | ||
| 552 | |||
| 553 | if (pick_free) { | ||
| 554 | spin_lock(&c->space_lock); | ||
| 555 | c->lst.taken_empty_lebs -= 1; | ||
| 556 | spin_unlock(&c->space_lock); | ||
| 557 | } | ||
| 558 | |||
| 559 | *free = lprops->free; | ||
| 560 | ubifs_release_lprops(c); | ||
| 561 | |||
| 562 | if (*free == c->leb_size) { | ||
| 563 | /* | ||
| 564 | * Ensure that empty LEBs have been unmapped. They may not have | ||
| 565 | * been, for example, because of an unclean unmount. Also | ||
| 566 | * LEBs that were freeable LEBs (free + dirty == leb_size) will | ||
| 567 | * not have been unmapped. | ||
| 568 | */ | ||
| 569 | err = ubifs_leb_unmap(c, lnum); | ||
| 570 | if (err) | ||
| 571 | return err; | ||
| 572 | } | ||
| 573 | |||
| 574 | dbg_find("found LEB %d, free %d", lnum, *free); | ||
| 575 | ubifs_assert(*free >= min_space); | ||
| 576 | return lnum; | ||
| 577 | |||
| 578 | out: | ||
| 579 | if (pick_free) { | ||
| 580 | spin_lock(&c->space_lock); | ||
| 581 | c->lst.taken_empty_lebs -= 1; | ||
| 582 | spin_unlock(&c->space_lock); | ||
| 583 | } | ||
| 584 | ubifs_release_lprops(c); | ||
| 585 | return err; | ||
| 586 | } | ||
| 587 | |||
| 588 | /** | ||
| 589 | * scan_for_idx_cb - callback used by the scan for a free LEB for the index. | ||
| 590 | * @c: the UBIFS file-system description object | ||
| 591 | * @lprops: LEB properties to scan | ||
| 592 | * @in_tree: whether the LEB properties are in main memory | ||
| 593 | * @data: information passed to and from the caller of the scan | ||
| 594 | * | ||
| 595 | * This function returns a code that indicates whether the scan should continue | ||
| 596 | * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree | ||
| 597 | * in main memory (%LPT_SCAN_ADD), or whether the scan should stop | ||
| 598 | * (%LPT_SCAN_STOP). | ||
| 599 | */ | ||
| 600 | static int scan_for_idx_cb(struct ubifs_info *c, | ||
| 601 | const struct ubifs_lprops *lprops, int in_tree, | ||
| 602 | struct scan_data *data) | ||
| 603 | { | ||
| 604 | int ret = LPT_SCAN_CONTINUE; | ||
| 605 | |||
| 606 | /* Exclude LEBs that are currently in use */ | ||
| 607 | if (lprops->flags & LPROPS_TAKEN) | ||
| 608 | return LPT_SCAN_CONTINUE; | ||
| 609 | /* Determine whether to add these LEB properties to the tree */ | ||
| 610 | if (!in_tree && valuable(c, lprops)) | ||
| 611 | ret |= LPT_SCAN_ADD; | ||
| 612 | /* Exclude index LEBS */ | ||
| 613 | if (lprops->flags & LPROPS_INDEX) | ||
| 614 | return ret; | ||
| 615 | /* Exclude LEBs that cannot be made empty */ | ||
| 616 | if (lprops->free + lprops->dirty != c->leb_size) | ||
| 617 | return ret; | ||
| 618 | /* | ||
| 619 | * We are allocating for the index so it is safe to allocate LEBs with | ||
| 620 | * only free and dirty space, because write buffers are sync'd at commit | ||
| 621 | * start. | ||
| 622 | */ | ||
| 623 | data->lnum = lprops->lnum; | ||
| 624 | return LPT_SCAN_ADD | LPT_SCAN_STOP; | ||
| 625 | } | ||
| 626 | |||
| 627 | /** | ||
| 628 | * scan_for_leb_for_idx - scan for a free LEB for the index. | ||
| 629 | * @c: the UBIFS file-system description object | ||
| 630 | */ | ||
| 631 | static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) | ||
| 632 | { | ||
| 633 | struct ubifs_lprops *lprops; | ||
| 634 | struct scan_data data; | ||
| 635 | int err; | ||
| 636 | |||
| 637 | data.lnum = -1; | ||
| 638 | err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, | ||
| 639 | (ubifs_lpt_scan_callback)scan_for_idx_cb, | ||
| 640 | &data); | ||
| 641 | if (err) | ||
| 642 | return ERR_PTR(err); | ||
| 643 | ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); | ||
| 644 | c->lscan_lnum = data.lnum; | ||
| 645 | lprops = ubifs_lpt_lookup_dirty(c, data.lnum); | ||
| 646 | if (IS_ERR(lprops)) | ||
| 647 | return lprops; | ||
| 648 | ubifs_assert(lprops->lnum == data.lnum); | ||
| 649 | ubifs_assert(lprops->free + lprops->dirty == c->leb_size); | ||
| 650 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 651 | ubifs_assert(!(lprops->flags & LPROPS_INDEX)); | ||
| 652 | return lprops; | ||
| 653 | } | ||
| 654 | |||
| 655 | /** | ||
| 656 | * ubifs_find_free_leb_for_idx - find a free LEB for the index. | ||
| 657 | * @c: the UBIFS file-system description object | ||
| 658 | * | ||
| 659 | * This function looks for a free LEB and returns that LEB number. The returned | ||
| 660 | * LEB is marked as "taken", "index". | ||
| 661 | * | ||
| 662 | * Only empty LEBs are allocated. This is for two reasons. First, the commit | ||
| 663 | * calculates the number of LEBs to allocate based on the assumption that they | ||
| 664 | * will be empty. Secondly, free space at the end of an index LEB is not | ||
| 665 | * guaranteed to be empty because it may have been used by the in-the-gaps | ||
| 666 | * method prior to an unclean unmount. | ||
| 667 | * | ||
| 668 | * If no LEB is found %-ENOSPC is returned. For other failures another negative | ||
| 669 | * error code is returned. | ||
| 670 | */ | ||
| 671 | int ubifs_find_free_leb_for_idx(struct ubifs_info *c) | ||
| 672 | { | ||
| 673 | const struct ubifs_lprops *lprops; | ||
| 674 | int lnum = -1, err, flags; | ||
| 675 | |||
| 676 | ubifs_get_lprops(c); | ||
| 677 | |||
| 678 | lprops = ubifs_fast_find_empty(c); | ||
| 679 | if (!lprops) { | ||
| 680 | lprops = ubifs_fast_find_freeable(c); | ||
| 681 | if (!lprops) { | ||
| 682 | ubifs_assert(c->freeable_cnt == 0); | ||
| 683 | if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { | ||
| 684 | lprops = scan_for_leb_for_idx(c); | ||
| 685 | if (IS_ERR(lprops)) { | ||
| 686 | err = PTR_ERR(lprops); | ||
| 687 | goto out; | ||
| 688 | } | ||
| 689 | } | ||
| 690 | } | ||
| 691 | } | ||
| 692 | |||
| 693 | if (!lprops) { | ||
| 694 | err = -ENOSPC; | ||
| 695 | goto out; | ||
| 696 | } | ||
| 697 | |||
| 698 | lnum = lprops->lnum; | ||
| 699 | |||
| 700 | dbg_find("found LEB %d, free %d, dirty %d, flags %#x", | ||
| 701 | lnum, lprops->free, lprops->dirty, lprops->flags); | ||
| 702 | |||
| 703 | flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX; | ||
| 704 | lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0); | ||
| 705 | if (IS_ERR(lprops)) { | ||
| 706 | err = PTR_ERR(lprops); | ||
| 707 | goto out; | ||
| 708 | } | ||
| 709 | |||
| 710 | ubifs_release_lprops(c); | ||
| 711 | |||
| 712 | /* | ||
| 713 | * Ensure that empty LEBs have been unmapped. They may not have been, | ||
| 714 | * for example, because of an unclean unmount. Also LEBs that were | ||
| 715 | * freeable LEBs (free + dirty == leb_size) will not have been unmapped. | ||
| 716 | */ | ||
| 717 | err = ubifs_leb_unmap(c, lnum); | ||
| 718 | if (err) { | ||
| 719 | ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, | ||
| 720 | LPROPS_TAKEN | LPROPS_INDEX, 0); | ||
| 721 | return err; | ||
| 722 | } | ||
| 723 | |||
| 724 | return lnum; | ||
| 725 | |||
| 726 | out: | ||
| 727 | ubifs_release_lprops(c); | ||
| 728 | return err; | ||
| 729 | } | ||
| 730 | |||
| 731 | static int cmp_dirty_idx(const struct ubifs_lprops **a, | ||
| 732 | const struct ubifs_lprops **b) | ||
| 733 | { | ||
| 734 | const struct ubifs_lprops *lpa = *a; | ||
| 735 | const struct ubifs_lprops *lpb = *b; | ||
| 736 | |||
| 737 | return lpa->dirty + lpa->free - lpb->dirty - lpb->free; | ||
| 738 | } | ||
| 739 | |||
/* Exchange two LEB property pointers; the @size argument is unused */
static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
			   int size)
{
	struct ubifs_lprops *tmp;

	tmp = *a;
	*a = *b;
	*b = tmp;
}
| 748 | |||
| 749 | /** | ||
| 750 | * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos. | ||
| 751 | * @c: the UBIFS file-system description object | ||
| 752 | * | ||
| 753 | * This function is called each commit to create an array of LEB numbers of | ||
| 754 | * dirty index LEBs sorted in order of dirty and free space. This is used by | ||
| 755 | * the in-the-gaps method of TNC commit. | ||
| 756 | */ | ||
| 757 | int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) | ||
| 758 | { | ||
| 759 | int i; | ||
| 760 | |||
| 761 | ubifs_get_lprops(c); | ||
| 762 | /* Copy the LPROPS_DIRTY_IDX heap */ | ||
| 763 | c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt; | ||
| 764 | memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr, | ||
| 765 | sizeof(void *) * c->dirty_idx.cnt); | ||
| 766 | /* Sort it so that the dirtiest is now at the end */ | ||
| 767 | sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), | ||
| 768 | (int (*)(const void *, const void *))cmp_dirty_idx, | ||
| 769 | (void (*)(void *, void *, int))swap_dirty_idx); | ||
| 770 | dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); | ||
| 771 | if (c->dirty_idx.cnt) | ||
| 772 | dbg_find("dirtiest index LEB is %d with dirty %d and free %d", | ||
| 773 | c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum, | ||
| 774 | c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty, | ||
| 775 | c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free); | ||
| 776 | /* Replace the lprops pointers with LEB numbers */ | ||
| 777 | for (i = 0; i < c->dirty_idx.cnt; i++) | ||
| 778 | c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum; | ||
| 779 | ubifs_release_lprops(c); | ||
| 780 | return 0; | ||
| 781 | } | ||
| 782 | |||
| 783 | /** | ||
| 784 | * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB. | ||
| 785 | * @c: the UBIFS file-system description object | ||
| 786 | * @lprops: LEB properties to scan | ||
| 787 | * @in_tree: whether the LEB properties are in main memory | ||
| 788 | * @data: information passed to and from the caller of the scan | ||
| 789 | * | ||
| 790 | * This function returns a code that indicates whether the scan should continue | ||
| 791 | * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree | ||
| 792 | * in main memory (%LPT_SCAN_ADD), or whether the scan should stop | ||
| 793 | * (%LPT_SCAN_STOP). | ||
| 794 | */ | ||
| 795 | static int scan_dirty_idx_cb(struct ubifs_info *c, | ||
| 796 | const struct ubifs_lprops *lprops, int in_tree, | ||
| 797 | struct scan_data *data) | ||
| 798 | { | ||
| 799 | int ret = LPT_SCAN_CONTINUE; | ||
| 800 | |||
| 801 | /* Exclude LEBs that are currently in use */ | ||
| 802 | if (lprops->flags & LPROPS_TAKEN) | ||
| 803 | return LPT_SCAN_CONTINUE; | ||
| 804 | /* Determine whether to add these LEB properties to the tree */ | ||
| 805 | if (!in_tree && valuable(c, lprops)) | ||
| 806 | ret |= LPT_SCAN_ADD; | ||
| 807 | /* Exclude non-index LEBs */ | ||
| 808 | if (!(lprops->flags & LPROPS_INDEX)) | ||
| 809 | return ret; | ||
| 810 | /* Exclude LEBs with too little space */ | ||
| 811 | if (lprops->free + lprops->dirty < c->min_idx_node_sz) | ||
| 812 | return ret; | ||
| 813 | /* Finally we found space */ | ||
| 814 | data->lnum = lprops->lnum; | ||
| 815 | return LPT_SCAN_ADD | LPT_SCAN_STOP; | ||
| 816 | } | ||
| 817 | |||
| 818 | /** | ||
| 819 | * find_dirty_idx_leb - find a dirty index LEB. | ||
| 820 | * @c: the UBIFS file-system description object | ||
| 821 | * | ||
| 822 | * This function returns LEB number upon success and a negative error code upon | ||
| 823 | * failure. In particular, -ENOSPC is returned if a dirty index LEB is not | ||
| 824 | * found. | ||
| 825 | * | ||
| 826 | * Note that this function scans the entire LPT but it is called very rarely. | ||
| 827 | */ | ||
| 828 | static int find_dirty_idx_leb(struct ubifs_info *c) | ||
| 829 | { | ||
| 830 | const struct ubifs_lprops *lprops; | ||
| 831 | struct ubifs_lpt_heap *heap; | ||
| 832 | struct scan_data data; | ||
| 833 | int err, i, ret; | ||
| 834 | |||
| 835 | /* Check all structures in memory first */ | ||
| 836 | data.lnum = -1; | ||
| 837 | heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; | ||
| 838 | for (i = 0; i < heap->cnt; i++) { | ||
| 839 | lprops = heap->arr[i]; | ||
| 840 | ret = scan_dirty_idx_cb(c, lprops, 1, &data); | ||
| 841 | if (ret & LPT_SCAN_STOP) | ||
| 842 | goto found; | ||
| 843 | } | ||
| 844 | list_for_each_entry(lprops, &c->frdi_idx_list, list) { | ||
| 845 | ret = scan_dirty_idx_cb(c, lprops, 1, &data); | ||
| 846 | if (ret & LPT_SCAN_STOP) | ||
| 847 | goto found; | ||
| 848 | } | ||
| 849 | list_for_each_entry(lprops, &c->uncat_list, list) { | ||
| 850 | ret = scan_dirty_idx_cb(c, lprops, 1, &data); | ||
| 851 | if (ret & LPT_SCAN_STOP) | ||
| 852 | goto found; | ||
| 853 | } | ||
| 854 | if (c->pnodes_have >= c->pnode_cnt) | ||
| 855 | /* All pnodes are in memory, so skip scan */ | ||
| 856 | return -ENOSPC; | ||
| 857 | err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, | ||
| 858 | (ubifs_lpt_scan_callback)scan_dirty_idx_cb, | ||
| 859 | &data); | ||
| 860 | if (err) | ||
| 861 | return err; | ||
| 862 | found: | ||
| 863 | ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); | ||
| 864 | c->lscan_lnum = data.lnum; | ||
| 865 | lprops = ubifs_lpt_lookup_dirty(c, data.lnum); | ||
| 866 | if (IS_ERR(lprops)) | ||
| 867 | return PTR_ERR(lprops); | ||
| 868 | ubifs_assert(lprops->lnum == data.lnum); | ||
| 869 | ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz); | ||
| 870 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 871 | ubifs_assert((lprops->flags & LPROPS_INDEX)); | ||
| 872 | |||
| 873 | dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x", | ||
| 874 | lprops->lnum, lprops->free, lprops->dirty, lprops->flags); | ||
| 875 | |||
| 876 | lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, | ||
| 877 | lprops->flags | LPROPS_TAKEN, 0); | ||
| 878 | if (IS_ERR(lprops)) | ||
| 879 | return PTR_ERR(lprops); | ||
| 880 | |||
| 881 | return lprops->lnum; | ||
| 882 | } | ||
| 883 | |||
| 884 | /** | ||
| 885 | * get_idx_gc_leb - try to get a LEB number from trivial GC. | ||
| 886 | * @c: the UBIFS file-system description object | ||
| 887 | */ | ||
| 888 | static int get_idx_gc_leb(struct ubifs_info *c) | ||
| 889 | { | ||
| 890 | const struct ubifs_lprops *lp; | ||
| 891 | int err, lnum; | ||
| 892 | |||
| 893 | err = ubifs_get_idx_gc_leb(c); | ||
| 894 | if (err < 0) | ||
| 895 | return err; | ||
| 896 | lnum = err; | ||
| 897 | /* | ||
| 898 | * The LEB was due to be unmapped after the commit but | ||
| 899 | * it is needed now for this commit. | ||
| 900 | */ | ||
| 901 | lp = ubifs_lpt_lookup_dirty(c, lnum); | ||
| 902 | if (unlikely(IS_ERR(lp))) | ||
| 903 | return PTR_ERR(lp); | ||
| 904 | lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, | ||
| 905 | lp->flags | LPROPS_INDEX, -1); | ||
| 906 | if (unlikely(IS_ERR(lp))) | ||
| 907 | return PTR_ERR(lp); | ||
| 908 | dbg_find("LEB %d, dirty %d and free %d flags %#x", | ||
| 909 | lp->lnum, lp->dirty, lp->free, lp->flags); | ||
| 910 | return lnum; | ||
| 911 | } | ||
| 912 | |||
| 913 | /** | ||
| 914 | * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array. | ||
| 915 | * @c: the UBIFS file-system description object | ||
| 916 | */ | ||
| 917 | static int find_dirtiest_idx_leb(struct ubifs_info *c) | ||
| 918 | { | ||
| 919 | const struct ubifs_lprops *lp; | ||
| 920 | int lnum; | ||
| 921 | |||
| 922 | while (1) { | ||
| 923 | if (!c->dirty_idx.cnt) | ||
| 924 | return -ENOSPC; | ||
| 925 | /* The lprops pointers were replaced by LEB numbers */ | ||
| 926 | lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt]; | ||
| 927 | lp = ubifs_lpt_lookup(c, lnum); | ||
| 928 | if (IS_ERR(lp)) | ||
| 929 | return PTR_ERR(lp); | ||
| 930 | if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX)) | ||
| 931 | continue; | ||
| 932 | lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, | ||
| 933 | lp->flags | LPROPS_TAKEN, 0); | ||
| 934 | if (IS_ERR(lp)) | ||
| 935 | return PTR_ERR(lp); | ||
| 936 | break; | ||
| 937 | } | ||
| 938 | dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, | ||
| 939 | lp->free, lp->flags); | ||
| 940 | ubifs_assert(lp->flags | LPROPS_TAKEN); | ||
| 941 | ubifs_assert(lp->flags | LPROPS_INDEX); | ||
| 942 | return lnum; | ||
| 943 | } | ||
| 944 | |||
| 945 | /** | ||
| 946 | * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit. | ||
| 947 | * @c: the UBIFS file-system description object | ||
| 948 | * | ||
| 949 | * This function attempts to find an untaken index LEB with the most free and | ||
| 950 | * dirty space that can be used without overwriting index nodes that were in the | ||
| 951 | * last index committed. | ||
| 952 | */ | ||
| 953 | int ubifs_find_dirty_idx_leb(struct ubifs_info *c) | ||
| 954 | { | ||
| 955 | int err; | ||
| 956 | |||
| 957 | ubifs_get_lprops(c); | ||
| 958 | |||
| 959 | /* | ||
| 960 | * We made an array of the dirtiest index LEB numbers as at the start of | ||
| 961 | * last commit. Try that array first. | ||
| 962 | */ | ||
| 963 | err = find_dirtiest_idx_leb(c); | ||
| 964 | |||
| 965 | /* Next try scanning the entire LPT */ | ||
| 966 | if (err == -ENOSPC) | ||
| 967 | err = find_dirty_idx_leb(c); | ||
| 968 | |||
| 969 | /* Finally take any index LEBs awaiting trivial GC */ | ||
| 970 | if (err == -ENOSPC) | ||
| 971 | err = get_idx_gc_leb(c); | ||
| 972 | |||
| 973 | ubifs_release_lprops(c); | ||
| 974 | return err; | ||
| 975 | } | ||
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c new file mode 100644 index 000000000000..d0f3dac29081 --- /dev/null +++ b/fs/ubifs/gc.c | |||
| @@ -0,0 +1,773 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements garbage collection. The procedure for garbage collection | ||
 * is different depending on whether a LEB is an index LEB (contains index
| 26 | * nodes) or not. For non-index LEBs, garbage collection finds a LEB which | ||
| 27 | * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete | ||
| 28 | * nodes to the journal, at which point the garbage-collected LEB is free to be | ||
| 29 | * reused. For index LEBs, garbage collection marks the non-obsolete index nodes | ||
| 30 | * dirty in the TNC, and after the next commit, the garbage-collected LEB is | ||
| 31 | * to be reused. Garbage collection will cause the number of dirty index nodes | ||
| 32 | * to grow, however sufficient space is reserved for the index to ensure the | ||
| 33 | * commit will never run out of space. | ||
| 34 | */ | ||
| 35 | |||
| 36 | #include <linux/pagemap.h> | ||
| 37 | #include "ubifs.h" | ||
| 38 | |||
/*
 * GC tries to optimize the way it fits nodes to available space, and it sorts
 * nodes a little. The below constants are watermarks which define "large",
 * "medium", and "small" nodes.
 */
#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ

/*
 * GC may need to move more than one LEB to make progress. The below constants
 * define "soft" and "hard" limits on the number of LEBs the garbage collector
 * may move.
 */
#define SOFT_LEBS_LIMIT 4
#define HARD_LEBS_LIMIT 32
| 54 | |||
/**
 * switch_gc_head - switch the garbage collection journal head.
 * @c: UBIFS file-system description object
 *
 * This function switches the GC head to the next LEB which is reserved in
 * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
 * and other negative error code in case of failures.
 */
static int switch_gc_head(struct ubifs_info *c)
{
	int err, gc_lnum = c->gc_lnum;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;

	ubifs_assert(gc_lnum != -1);
	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
	       c->leb_size - wbuf->offs - wbuf->used);

	/* Flush buffered data of the old GC head to the flash first */
	err = ubifs_wbuf_sync_nolock(wbuf);
	if (err)
		return err;

	/*
	 * The GC write-buffer was synchronized, we may safely unmap
	 * 'c->gc_lnum'.
	 */
	err = ubifs_leb_unmap(c, gc_lnum);
	if (err)
		return err;

	/* Record the reserved LEB in the log as the new GC head bud */
	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
	if (err)
		return err;

	/* The reserved LEB is consumed; a new one must be reserved later */
	c->gc_lnum = -1;
	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
	return err;
}
| 97 | |||
/**
 * move_nodes - move nodes.
 * @c: UBIFS file-system description object
 * @sleb: describes nodes to move
 *
 * This function moves valid nodes from data LEB described by @sleb to the GC
 * journal head. The obsolete nodes are dropped.
 *
 * When moving nodes we have to deal with classical bin-packing problem: the
 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
 * where the nodes in the @sleb->nodes list are the elements which should be
 * fit optimally to the bins. This function uses the "first fit decreasing"
 * strategy, although it does not really sort the nodes but just split them on
 * 3 classes - large, medium, and small, so they are roughly sorted.
 *
 * This function returns zero in case of success, %-EAGAIN if commit is
 * required, and other negative error codes in case of other failures.
 */
static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
{
	struct ubifs_scan_node *snod, *tmp;
	struct list_head large, medium, small;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
	int avail, err, min = INT_MAX;

	INIT_LIST_HEAD(&large);
	INIT_LIST_HEAD(&medium);
	INIT_LIST_HEAD(&small);

	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
		struct list_head *lst;

		/* Index, reference and commit-start nodes never live in
		 * data LEBs */
		ubifs_assert(snod->type != UBIFS_IDX_NODE);
		ubifs_assert(snod->type != UBIFS_REF_NODE);
		ubifs_assert(snod->type != UBIFS_CS_NODE);

		/* Returns 0 if the TNC no longer references this node,
		 * i.e. it is obsolete */
		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
					 snod->offs, 0);
		if (err < 0)
			goto out;

		lst = &snod->list;
		list_del(lst);
		if (!err) {
			/* The node is obsolete, remove it from the list */
			kfree(snod);
			continue;
		}

		/*
		 * Sort the list of nodes so that large nodes go first, and
		 * small nodes go last.
		 */
		if (snod->len > MEDIUM_NODE_WM)
			list_add(lst, &large);
		else if (snod->len > SMALL_NODE_WM)
			list_add(lst, &medium);
		else
			list_add(lst, &small);

		/* And find the smallest node */
		if (snod->len < min)
			min = snod->len;
	}

	/*
	 * Join the three lists so that we'd have one roughly sorted list
	 * ('large' will be the head of the joined list).
	 */
	list_splice(&medium, large.prev);
	list_splice(&small, large.prev);

	if (wbuf->lnum == -1) {
		/*
		 * The GC journal head is not set, because it is the first GC
		 * invocation since mount.
		 */
		err = switch_gc_head(c);
		if (err)
			goto out;
	}

	/* Write nodes to their new location. Use the first-fit strategy */
	while (1) {
		avail = c->leb_size - wbuf->offs - wbuf->used;
		list_for_each_entry_safe(snod, tmp, &large, list) {
			int new_lnum, new_offs;

			/* Even the smallest remaining node cannot fit, so
			 * stop scanning this LEB */
			if (avail < min)
				break;

			if (snod->len > avail)
				/* This node does not fit */
				continue;

			cond_resched();

			new_lnum = wbuf->lnum;
			new_offs = wbuf->offs + wbuf->used;
			err = ubifs_wbuf_write_nolock(wbuf, snod->node,
						      snod->len);
			if (err)
				goto out;
			/* Re-point the TNC at the node's new location */
			err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
						snod->offs, new_lnum, new_offs,
						snod->len);
			if (err)
				goto out;

			avail = c->leb_size - wbuf->offs - wbuf->used;
			list_del(&snod->list);
			kfree(snod);
		}

		if (list_empty(&large))
			break;

		/*
		 * Waste the rest of the space in the LEB and switch to the
		 * next LEB.
		 */
		err = switch_gc_head(c);
		if (err)
			goto out;
	}

	return 0;

out:
	/* Free the nodes which were removed from @sleb but not yet moved */
	list_for_each_entry_safe(snod, tmp, &large, list) {
		list_del(&snod->list);
		kfree(snod);
	}
	return err;
}
| 233 | |||
| 234 | /** | ||
| 235 | * gc_sync_wbufs - sync write-buffers for GC. | ||
| 236 | * @c: UBIFS file-system description object | ||
| 237 | * | ||
| 238 | * We must guarantee that obsoleting nodes are on flash. Unfortunately they may | ||
| 239 | * be in a write-buffer instead. That is, a node could be written to a | ||
| 240 | * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is | ||
| 241 | * erased before the write-buffer is sync'd and then there is an unclean | ||
| 242 | * unmount, then an existing node is lost. To avoid this, we sync all | ||
| 243 | * write-buffers. | ||
| 244 | * | ||
| 245 | * This function returns %0 on success or a negative error code on failure. | ||
| 246 | */ | ||
| 247 | static int gc_sync_wbufs(struct ubifs_info *c) | ||
| 248 | { | ||
| 249 | int err, i; | ||
| 250 | |||
| 251 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 252 | if (i == GCHD) | ||
| 253 | continue; | ||
| 254 | err = ubifs_wbuf_sync(&c->jheads[i].wbuf); | ||
| 255 | if (err) | ||
| 256 | return err; | ||
| 257 | } | ||
| 258 | return 0; | ||
| 259 | } | ||
| 260 | |||
/**
 * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
 * @c: UBIFS file-system description object
 * @lp: describes the LEB to garbage collect
 *
 * This function garbage-collects an LEB and returns one of the @LEB_FREED,
 * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
 * required, and other negative error codes in case of failures.
 */
int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
{
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
	int err = 0, lnum = lp->lnum;

	ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
		     c->need_recovery);
	/* GC must never be pointed at its own reserved LEB or its head */
	ubifs_assert(c->gc_lnum != lnum);
	ubifs_assert(wbuf->lnum != lnum);

	/*
	 * We scan the entire LEB even though we only really need to scan up to
	 * (c->leb_size - lp->free).
	 */
	sleb = ubifs_scan(c, lnum, 0, c->sbuf);
	if (IS_ERR(sleb))
		return PTR_ERR(sleb);

	ubifs_assert(!list_empty(&sleb->nodes));
	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);

	/* A LEB's first node determines whether it is an index LEB */
	if (snod->type == UBIFS_IDX_NODE) {
		struct ubifs_gced_idx_leb *idx_gc;

		dbg_gc("indexing LEB %d (free %d, dirty %d)",
		       lnum, lp->free, lp->dirty);
		/* Index LEBs are GC'ed by dirtying their live index nodes in
		 * the TNC; the next commit will write them elsewhere */
		list_for_each_entry(snod, &sleb->nodes, list) {
			struct ubifs_idx_node *idx = snod->node;
			int level = le16_to_cpu(idx->level);

			ubifs_assert(snod->type == UBIFS_IDX_NODE);
			key_read(c, ubifs_idx_key(c, idx), &snod->key);
			err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
						   snod->offs);
			if (err)
				goto out;
		}

		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
		if (!idx_gc) {
			err = -ENOMEM;
			goto out;
		}

		idx_gc->lnum = lnum;
		idx_gc->unmap = 0;
		list_add(&idx_gc->list, &c->idx_gc);

		/*
		 * Don't release the LEB until after the next commit, because
		 * it may contain data which is needed for recovery. So
		 * although we freed this LEB, it will become usable only after
		 * the commit.
		 */
		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
					  LPROPS_INDEX, 1);
		if (err)
			goto out;
		err = LEB_FREED_IDX;
	} else {
		dbg_gc("data LEB %d (free %d, dirty %d)",
		       lnum, lp->free, lp->dirty);

		/* Copy live nodes to the GC journal head */
		err = move_nodes(c, sleb);
		if (err)
			goto out;

		/* Obsoleting nodes must hit the flash before the LEB is
		 * erased — see gc_sync_wbufs() */
		err = gc_sync_wbufs(c);
		if (err)
			goto out;

		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
		if (err)
			goto out;

		if (c->gc_lnum == -1) {
			/* Keep this LEB as the next reserved GC LEB */
			c->gc_lnum = lnum;
			err = LEB_RETAINED;
		} else {
			err = ubifs_wbuf_sync_nolock(wbuf);
			if (err)
				goto out;

			err = ubifs_leb_unmap(c, lnum);
			if (err)
				goto out;

			err = LEB_FREED;
		}
	}

out:
	ubifs_scan_destroy(sleb);
	return err;
}
| 367 | |||
/**
 * ubifs_garbage_collect - UBIFS garbage collector.
 * @c: UBIFS file-system description object
 * @anyway: do GC even if there are free LEBs
 *
 * This function does out-of-place garbage collection. The return codes are:
 *   o positive LEB number if the LEB has been freed and may be used;
 *   o %-EAGAIN if the caller has to run commit;
 *   o %-ENOSPC if GC failed to make any progress;
 *   o other negative error codes in case of other errors.
 *
 * Garbage collector writes data to the journal when GC'ing data LEBs, and just
 * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
 * commit may be required. But commit cannot be run from inside GC, because the
 * caller might be holding the commit lock, so %-EAGAIN is returned instead;
 * And this error code means that the caller has to run commit, and re-run GC
 * if there is still no free space.
 *
 * There are many reasons why this function may return %-EAGAIN:
 * o the log is full and there is no space to write an LEB reference for
 *   @c->gc_lnum;
 * o the journal is too large and exceeds size limitations;
 * o GC moved indexing LEBs, but they can be used only after the commit;
 * o the shrinker fails to find clean znodes to free and requests the commit;
 * o etc.
 *
 * Note, if the file-system is close to be full, this function may return
 * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
 * the function. E.g., this happens if the limits on the journal size are too
 * tough and GC writes too much to the journal before an LEB is freed. This
 * might also mean that the journal is too large, and the TNC becomes too big,
 * so that the shrinker is constantly called, finds no clean znodes to free,
 * and requests commit. Well, this may also happen if the journal is all right,
 * but another kernel process consumes too much memory. Anyway, infinite
 * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
 */
int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
{
	int i, err, ret, min_space = c->dead_wm;
	struct ubifs_lprops lp;
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;

	ubifs_assert_cmt_locked(c);

	if (ubifs_gc_should_commit(c))
		return -EAGAIN;

	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);

	if (c->ro_media) {
		ret = -EROFS;
		goto out_unlock;
	}

	/* We expect the write-buffer to be empty on entry */
	ubifs_assert(!wbuf->used);

	for (i = 0; ; i++) {
		/* NOTE(review): this initializer is dead — space_before is
		 * recomputed below before its only use */
		int space_before = c->leb_size - wbuf->offs - wbuf->used;
		int space_after;

		cond_resched();

		/* Give the commit an opportunity to run */
		if (ubifs_gc_should_commit(c)) {
			ret = -EAGAIN;
			break;
		}

		if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
			/*
			 * We've done enough iterations. Indexing LEBs were
			 * moved and will be available after the commit.
			 */
			dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
			ubifs_commit_required(c);
			ret = -EAGAIN;
			break;
		}

		if (i > HARD_LEBS_LIMIT) {
			/*
			 * We've moved too many LEBs and have not made
			 * progress, give up.
			 */
			dbg_gc("hard limit, -ENOSPC");
			ret = -ENOSPC;
			break;
		}

		/*
		 * Empty and freeable LEBs can turn up while we waited for
		 * the wbuf lock, or while we have been running GC. In that
		 * case, we should just return one of those instead of
		 * continuing to GC dirty LEBs. Hence we request
		 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
		 */
		ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
		if (ret) {
			if (ret == -ENOSPC)
				dbg_gc("no more dirty LEBs");
			break;
		}

		dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
		       "(min. space %d)", lp.lnum, lp.free, lp.dirty,
		       lp.free + lp.dirty, min_space);

		if (lp.free + lp.dirty == c->leb_size) {
			/* An empty LEB was returned */
			dbg_gc("LEB %d is free, return it", lp.lnum);
			/*
			 * ubifs_find_dirty_leb() doesn't return freeable index
			 * LEBs.
			 */
			ubifs_assert(!(lp.flags & LPROPS_INDEX));
			if (lp.free != c->leb_size) {
				/*
				 * Write buffers must be sync'd before
				 * unmapping freeable LEBs, because one of them
				 * may contain data which obsoletes something
				 * in 'lp.pnum'.
				 */
				ret = gc_sync_wbufs(c);
				if (ret)
					goto out;
				ret = ubifs_change_one_lp(c, lp.lnum,
							  c->leb_size, 0, 0, 0,
							  0);
				if (ret)
					goto out;
			}
			ret = ubifs_leb_unmap(c, lp.lnum);
			if (ret)
				goto out;
			ret = lp.lnum;
			break;
		}

		/* Measure GC head space before/after to detect progress */
		space_before = c->leb_size - wbuf->offs - wbuf->used;
		if (wbuf->lnum == -1)
			space_before = 0;

		ret = ubifs_garbage_collect_leb(c, &lp);
		if (ret < 0) {
			if (ret == -EAGAIN || ret == -ENOSPC) {
				/*
				 * These codes are not errors, so we have to
				 * return the LEB to lprops. But if the
				 * 'ubifs_return_leb()' function fails, its
				 * failure code is propagated to the caller
				 * instead of the original '-EAGAIN' or
				 * '-ENOSPC'.
				 */
				err = ubifs_return_leb(c, lp.lnum);
				if (err)
					ret = err;
				break;
			}
			goto out;
		}

		if (ret == LEB_FREED) {
			/* An LEB has been freed and is ready for use */
			dbg_gc("LEB %d freed, return", lp.lnum);
			ret = lp.lnum;
			break;
		}

		if (ret == LEB_FREED_IDX) {
			/*
			 * This was an indexing LEB and it cannot be
			 * immediately used. And instead of requesting the
			 * commit straight away, we try to garbage collect some
			 * more.
			 */
			dbg_gc("indexing LEB %d freed, continue", lp.lnum);
			continue;
		}

		ubifs_assert(ret == LEB_RETAINED);
		space_after = c->leb_size - wbuf->offs - wbuf->used;
		dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
		       space_after - space_before);

		if (space_after > space_before) {
			/* GC makes progress, keep working */
			min_space >>= 1;
			if (min_space < c->dead_wm)
				min_space = c->dead_wm;
			continue;
		}

		dbg_gc("did not make progress");

		/*
		 * GC moved an LEB but has not made any progress. This means
		 * that the previous GC head LEB contained too little free
		 * space and the LEB which was GC'ed contained only large
		 * nodes which did not fit that space.
		 *
		 * We can do 2 things:
		 * 1. pick another LEB in a hope it'll contain a small node
		 *    which will fit the space we have at the end of current GC
		 *    head LEB, but there is no guarantee, so we try this out
		 *    unless we have already been working for too long;
		 * 2. request an LEB with more dirty space, which will force
		 *    'ubifs_find_dirty_leb()' to start scanning the lprops
		 *    table, instead of just picking one from the heap
		 *    (previously it already picked the dirtiest LEB).
		 */
		if (i < SOFT_LEBS_LIMIT) {
			dbg_gc("try again");
			continue;
		}

		min_space <<= 1;
		if (min_space > c->dark_wm)
			min_space = c->dark_wm;
		dbg_gc("set min. space to %d", min_space);
	}

	if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
		dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
		ubifs_commit_required(c);
		ret = -EAGAIN;
	}

	/* NOTE(review): this assumes 'c->gc_lnum' is a valid LEB here —
	 * TODO confirm it cannot be -1 on any path that reaches this point */
	err = ubifs_wbuf_sync_nolock(wbuf);
	if (!err)
		err = ubifs_leb_unmap(c, c->gc_lnum);
	if (err) {
		ret = err;
		goto out;
	}
out_unlock:
	mutex_unlock(&wbuf->io_mutex);
	return ret;

out:
	/* A real error: switch to read-only mode to prevent corruption */
	ubifs_assert(ret < 0);
	ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
	ubifs_ro_mode(c, ret);
	ubifs_wbuf_sync_nolock(wbuf);
	mutex_unlock(&wbuf->io_mutex);
	ubifs_return_leb(c, lp.lnum);
	return ret;
}
| 616 | |||
/**
 * ubifs_gc_start_commit - garbage collection at start of commit.
 * @c: UBIFS file-system description object
 *
 * If a LEB has only dirty and free space, then we may safely unmap it and make
 * it free. Note, we cannot do this with indexing LEBs because dirty space may
 * correspond to index nodes that are required for recovery. In that case, the
 * LEB cannot be unmapped until after the next commit.
 *
 * This function returns %0 upon success and a negative error code upon failure.
 */
int ubifs_gc_start_commit(struct ubifs_info *c)
{
	struct ubifs_gced_idx_leb *idx_gc;
	const struct ubifs_lprops *lp;
	int err = 0, flags;

	ubifs_get_lprops(c);

	/*
	 * Unmap (non-index) freeable LEBs. Note that recovery requires that all
	 * wbufs are sync'd before this, which is done in 'do_commit()'.
	 */
	while (1) {
		lp = ubifs_fast_find_freeable(c);
		if (unlikely(IS_ERR(lp))) {
			err = PTR_ERR(lp);
			goto out;
		}
		if (!lp)
			break;
		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
		ubifs_assert(!(lp->flags & LPROPS_INDEX));
		err = ubifs_leb_unmap(c, lp->lnum);
		if (err)
			goto out;
		/* The whole LEB is free now; update lprops accordingly */
		lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
		if (unlikely(IS_ERR(lp))) {
			err = PTR_ERR(lp);
			goto out;
		}
		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
		ubifs_assert(!(lp->flags & LPROPS_INDEX));
	}

	/* Mark GC'd index LEBs OK to unmap after this commit finishes */
	list_for_each_entry(idx_gc, &c->idx_gc, list)
		idx_gc->unmap = 1;

	/* Record index freeable LEBs for unmapping after commit */
	while (1) {
		lp = ubifs_fast_find_frdi_idx(c);
		if (unlikely(IS_ERR(lp))) {
			err = PTR_ERR(lp);
			goto out;
		}
		if (!lp)
			break;
		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
		if (!idx_gc) {
			err = -ENOMEM;
			goto out;
		}
		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
		ubifs_assert(lp->flags & LPROPS_INDEX);
		/* Don't release the LEB until after the next commit */
		/* Set LPROPS_TAKEN and clear LPROPS_INDEX (asserted set above,
		 * so the XOR clears it) in one expression */
		flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
		lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
		if (unlikely(IS_ERR(lp))) {
			err = PTR_ERR(lp);
			kfree(idx_gc);
			goto out;
		}
		ubifs_assert(lp->flags & LPROPS_TAKEN);
		ubifs_assert(!(lp->flags & LPROPS_INDEX));
		idx_gc->lnum = lp->lnum;
		idx_gc->unmap = 1;
		list_add(&idx_gc->list, &c->idx_gc);
	}
out:
	ubifs_release_lprops(c);
	return err;
}
| 700 | |||
/**
 * ubifs_gc_end_commit - garbage collection at end of commit.
 * @c: UBIFS file-system description object
 *
 * This function completes out-of-place garbage collection of index LEBs.
 * Every entry on @c->idx_gc that was marked for unmapping is unmapped, its
 * %LPROPS_TAKEN flag is cleared, and the entry is removed and freed.
 *
 * Returns %0 on success or a negative error code on failure.
 */
int ubifs_gc_end_commit(struct ubifs_info *c)
{
	struct ubifs_gced_idx_leb *idx_gc, *tmp;
	struct ubifs_wbuf *wbuf;
	int err = 0;

	/* Take the GC head I/O mutex to serialize with the garbage collector */
	wbuf = &c->jheads[GCHD].wbuf;
	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
	list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
		if (idx_gc->unmap) {
			dbg_gc("LEB %d", idx_gc->lnum);
			err = ubifs_leb_unmap(c, idx_gc->lnum);
			if (err)
				goto out;
			/* Clear LPROPS_TAKEN so the LEB can be used again */
			err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
			if (err)
				goto out;
			list_del(&idx_gc->list);
			kfree(idx_gc);
		}
out:
	mutex_unlock(&wbuf->io_mutex);
	return err;
}
| 732 | |||
| 733 | /** | ||
| 734 | * ubifs_destroy_idx_gc - destroy idx_gc list. | ||
| 735 | * @c: UBIFS file-system description object | ||
| 736 | * | ||
| 737 | * This function destroys the idx_gc list. It is called when unmounting or | ||
| 738 | * remounting read-only so locks are not needed. | ||
| 739 | */ | ||
| 740 | void ubifs_destroy_idx_gc(struct ubifs_info *c) | ||
| 741 | { | ||
| 742 | while (!list_empty(&c->idx_gc)) { | ||
| 743 | struct ubifs_gced_idx_leb *idx_gc; | ||
| 744 | |||
| 745 | idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, | ||
| 746 | list); | ||
| 747 | c->idx_gc_cnt -= 1; | ||
| 748 | list_del(&idx_gc->list); | ||
| 749 | kfree(idx_gc); | ||
| 750 | } | ||
| 751 | |||
| 752 | } | ||
| 753 | |||
| 754 | /** | ||
| 755 | * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list. | ||
| 756 | * @c: UBIFS file-system description object | ||
| 757 | * | ||
| 758 | * Called during start commit so locks are not needed. | ||
| 759 | */ | ||
| 760 | int ubifs_get_idx_gc_leb(struct ubifs_info *c) | ||
| 761 | { | ||
| 762 | struct ubifs_gced_idx_leb *idx_gc; | ||
| 763 | int lnum; | ||
| 764 | |||
| 765 | if (list_empty(&c->idx_gc)) | ||
| 766 | return -ENOSPC; | ||
| 767 | idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list); | ||
| 768 | lnum = idx_gc->lnum; | ||
| 769 | /* c->idx_gc_cnt is updated by the caller when lprops are updated */ | ||
| 770 | list_del(&idx_gc->list); | ||
| 771 | kfree(idx_gc); | ||
| 772 | return lnum; | ||
| 773 | } | ||
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c new file mode 100644 index 000000000000..3374f91b6709 --- /dev/null +++ b/fs/ubifs/io.c | |||
| @@ -0,0 +1,914 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * Copyright (C) 2006, 2007 University of Szeged, Hungary | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify it | ||
| 8 | * under the terms of the GNU General Public License version 2 as published by | ||
| 9 | * the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 14 | * more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License along with | ||
| 17 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 18 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 19 | * | ||
| 20 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | * Adrian Hunter | ||
| 22 | * Zoltan Sogor | ||
| 23 | */ | ||
| 24 | |||
| 25 | /* | ||
| 26 | * This file implements UBIFS I/O subsystem which provides various I/O-related | ||
| 27 | * helper functions (reading/writing/checking/validating nodes) and implements | ||
| 28 | * write-buffering support. Write buffers help to save space which otherwise | ||
| 29 | * would have been wasted for padding to the nearest minimal I/O unit boundary. | ||
| 30 | * Instead, data first goes to the write-buffer and is flushed when the | ||
| 31 | * buffer is full or when it is not used for some time (by timer). This is | ||
 * similar to the mechanism used by JFFS2.
| 33 | * | ||
| 34 | * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by | ||
| 35 | * mutexes defined inside these objects. Since sometimes upper-level code | ||
| 36 | * has to lock the write-buffer (e.g. journal space reservation code), many | ||
| 37 | * functions related to write-buffers have "nolock" suffix which means that the | ||
| 38 | * caller has to lock the write-buffer before calling this function. | ||
| 39 | * | ||
| 40 | * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not | ||
| 41 | * aligned, UBIFS starts the next node from the aligned address, and the padded | ||
| 42 | * bytes may contain any rubbish. In other words, UBIFS does not put padding | ||
| 43 | * bytes in those small gaps. Common headers of nodes store real node lengths, | ||
| 44 | * not aligned lengths. Indexing nodes also store real lengths in branches. | ||
| 45 | * | ||
| 46 | * UBIFS uses padding when it pads to the next min. I/O unit. In this case it | ||
| 47 | * uses padding nodes or padding bytes, if the padding node does not fit. | ||
| 48 | * | ||
| 49 | * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes | ||
| 50 | * every time they are read from the flash media. | ||
| 51 | */ | ||
| 52 | |||
| 53 | #include <linux/crc32.h> | ||
| 54 | #include "ubifs.h" | ||
| 55 | |||
/**
 * ubifs_check_node - check node.
 * @c: UBIFS file-system description object
 * @buf: node to check
 * @lnum: logical eraseblock number
 * @offs: offset within the logical eraseblock
 * @quiet: print no messages
 *
 * This function checks node magic number and CRC checksum. This function also
 * validates node length to prevent UBIFS from becoming crazy when an attacker
 * feeds it a file-system image with incorrect nodes. For example, too large
 * node length in the common header could cause UBIFS to read memory outside of
 * allocated buffer when checking the CRC checksum.
 *
 * This function returns zero in case of success, %-EUCLEAN in case of bad CRC
 * or magic, and %-EINVAL in case of a bad node type or length.
 */
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
		     int offs, int quiet)
{
	int err = -EINVAL, type, node_len;
	uint32_t crc, node_crc, magic;
	const struct ubifs_ch *ch = buf;

	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
	ubifs_assert(!(offs & 7) && offs < c->leb_size);

	magic = le32_to_cpu(ch->magic);
	if (magic != UBIFS_NODE_MAGIC) {
		if (!quiet)
			ubifs_err("bad magic %#08x, expected %#08x",
				  magic, UBIFS_NODE_MAGIC);
		err = -EUCLEAN;
		goto out;
	}

	type = ch->node_type;
	if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
		if (!quiet)
			ubifs_err("bad node type %d", type);
		goto out;
	}

	/* The node must not extend past the end of the eraseblock */
	node_len = le32_to_cpu(ch->len);
	if (node_len + offs > c->leb_size)
		goto out_len;

	/*
	 * Validate the length against the per-type range table; fixed-size
	 * node types have max_len == 0 and must match exactly.
	 */
	if (c->ranges[type].max_len == 0) {
		if (node_len != c->ranges[type].len)
			goto out_len;
	} else if (node_len < c->ranges[type].min_len ||
		   node_len > c->ranges[type].max_len)
		goto out_len;

	/* The CRC covers everything after the 8-byte magic/CRC prefix */
	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
	node_crc = le32_to_cpu(ch->crc);
	if (crc != node_crc) {
		if (!quiet)
			ubifs_err("bad CRC: calculated %#08x, read %#08x",
				  crc, node_crc);
		err = -EUCLEAN;
		goto out;
	}

	return 0;

out_len:
	if (!quiet)
		ubifs_err("bad node length %d", node_len);
out:
	if (!quiet) {
		ubifs_err("bad node at LEB %d:%d", lnum, offs);
		dbg_dump_node(c, buf);
		dbg_dump_stack();
	}
	return err;
}
| 133 | |||
| 134 | /** | ||
| 135 | * ubifs_pad - pad flash space. | ||
| 136 | * @c: UBIFS file-system description object | ||
| 137 | * @buf: buffer to put padding to | ||
| 138 | * @pad: how many bytes to pad | ||
| 139 | * | ||
| 140 | * The flash media obliges us to write only in chunks of %c->min_io_size and | ||
| 141 | * when we have to write less data we add padding node to the write-buffer and | ||
| 142 | * pad it to the next minimal I/O unit's boundary. Padding nodes help when the | ||
| 143 | * media is being scanned. If the amount of wasted space is not enough to fit a | ||
| 144 | * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes | ||
| 145 | * pattern (%UBIFS_PADDING_BYTE). | ||
| 146 | * | ||
| 147 | * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is | ||
| 148 | * used. | ||
| 149 | */ | ||
| 150 | void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) | ||
| 151 | { | ||
| 152 | uint32_t crc; | ||
| 153 | |||
| 154 | ubifs_assert(pad >= 0 && !(pad & 7)); | ||
| 155 | |||
| 156 | if (pad >= UBIFS_PAD_NODE_SZ) { | ||
| 157 | struct ubifs_ch *ch = buf; | ||
| 158 | struct ubifs_pad_node *pad_node = buf; | ||
| 159 | |||
| 160 | ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); | ||
| 161 | ch->node_type = UBIFS_PAD_NODE; | ||
| 162 | ch->group_type = UBIFS_NO_NODE_GROUP; | ||
| 163 | ch->padding[0] = ch->padding[1] = 0; | ||
| 164 | ch->sqnum = 0; | ||
| 165 | ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); | ||
| 166 | pad -= UBIFS_PAD_NODE_SZ; | ||
| 167 | pad_node->pad_len = cpu_to_le32(pad); | ||
| 168 | crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); | ||
| 169 | ch->crc = cpu_to_le32(crc); | ||
| 170 | memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); | ||
| 171 | } else if (pad > 0) | ||
| 172 | /* Too little space, padding node won't fit */ | ||
| 173 | memset(buf, UBIFS_PADDING_BYTE, pad); | ||
| 174 | } | ||
| 175 | |||
| 176 | /** | ||
| 177 | * next_sqnum - get next sequence number. | ||
| 178 | * @c: UBIFS file-system description object | ||
| 179 | */ | ||
| 180 | static unsigned long long next_sqnum(struct ubifs_info *c) | ||
| 181 | { | ||
| 182 | unsigned long long sqnum; | ||
| 183 | |||
| 184 | spin_lock(&c->cnt_lock); | ||
| 185 | sqnum = ++c->max_sqnum; | ||
| 186 | spin_unlock(&c->cnt_lock); | ||
| 187 | |||
| 188 | if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { | ||
| 189 | if (sqnum >= SQNUM_WATERMARK) { | ||
| 190 | ubifs_err("sequence number overflow %llu, end of life", | ||
| 191 | sqnum); | ||
| 192 | ubifs_ro_mode(c, -EINVAL); | ||
| 193 | } | ||
| 194 | ubifs_warn("running out of sequence numbers, end of life soon"); | ||
| 195 | } | ||
| 196 | |||
| 197 | return sqnum; | ||
| 198 | } | ||
| 199 | |||
| 200 | /** | ||
| 201 | * ubifs_prepare_node - prepare node to be written to flash. | ||
| 202 | * @c: UBIFS file-system description object | ||
| 203 | * @node: the node to pad | ||
| 204 | * @len: node length | ||
| 205 | * @pad: if the buffer has to be padded | ||
| 206 | * | ||
| 207 | * This function prepares node at @node to be written to the media - it | ||
| 208 | * calculates node CRC, fills the common header, and adds proper padding up to | ||
| 209 | * the next minimum I/O unit if @pad is not zero. | ||
| 210 | */ | ||
| 211 | void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) | ||
| 212 | { | ||
| 213 | uint32_t crc; | ||
| 214 | struct ubifs_ch *ch = node; | ||
| 215 | unsigned long long sqnum = next_sqnum(c); | ||
| 216 | |||
| 217 | ubifs_assert(len >= UBIFS_CH_SZ); | ||
| 218 | |||
| 219 | ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); | ||
| 220 | ch->len = cpu_to_le32(len); | ||
| 221 | ch->group_type = UBIFS_NO_NODE_GROUP; | ||
| 222 | ch->sqnum = cpu_to_le64(sqnum); | ||
| 223 | ch->padding[0] = ch->padding[1] = 0; | ||
| 224 | crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); | ||
| 225 | ch->crc = cpu_to_le32(crc); | ||
| 226 | |||
| 227 | if (pad) { | ||
| 228 | len = ALIGN(len, 8); | ||
| 229 | pad = ALIGN(len, c->min_io_size) - len; | ||
| 230 | ubifs_pad(c, node + len, pad); | ||
| 231 | } | ||
| 232 | } | ||
| 233 | |||
| 234 | /** | ||
| 235 | * ubifs_prep_grp_node - prepare node of a group to be written to flash. | ||
| 236 | * @c: UBIFS file-system description object | ||
| 237 | * @node: the node to pad | ||
| 238 | * @len: node length | ||
| 239 | * @last: indicates the last node of the group | ||
| 240 | * | ||
| 241 | * This function prepares node at @node to be written to the media - it | ||
| 242 | * calculates node CRC and fills the common header. | ||
| 243 | */ | ||
| 244 | void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) | ||
| 245 | { | ||
| 246 | uint32_t crc; | ||
| 247 | struct ubifs_ch *ch = node; | ||
| 248 | unsigned long long sqnum = next_sqnum(c); | ||
| 249 | |||
| 250 | ubifs_assert(len >= UBIFS_CH_SZ); | ||
| 251 | |||
| 252 | ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); | ||
| 253 | ch->len = cpu_to_le32(len); | ||
| 254 | if (last) | ||
| 255 | ch->group_type = UBIFS_LAST_OF_NODE_GROUP; | ||
| 256 | else | ||
| 257 | ch->group_type = UBIFS_IN_NODE_GROUP; | ||
| 258 | ch->sqnum = cpu_to_le64(sqnum); | ||
| 259 | ch->padding[0] = ch->padding[1] = 0; | ||
| 260 | crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); | ||
| 261 | ch->crc = cpu_to_le32(crc); | ||
| 262 | } | ||
| 263 | |||
/**
 * wbuf_timer_callback_nolock - write-buffer timer callback function.
 * @data: timer data (write-buffer descriptor)
 *
 * This function is called when the write-buffer timer expires. It marks the
 * write-buffer as needing synchronization and wakes up the background thread,
 * which performs the actual syncing (see 'ubifs_bg_wbufs_sync()').
 */
static void wbuf_timer_callback_nolock(unsigned long data)
{
	struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;

	wbuf->need_sync = 1;
	wbuf->c->need_wbuf_sync = 1;
	ubifs_wake_up_bgt(wbuf->c);
}
| 278 | |||
| 279 | /** | ||
| 280 | * new_wbuf_timer - start new write-buffer timer. | ||
| 281 | * @wbuf: write-buffer descriptor | ||
| 282 | */ | ||
| 283 | static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) | ||
| 284 | { | ||
| 285 | ubifs_assert(!timer_pending(&wbuf->timer)); | ||
| 286 | |||
| 287 | if (!wbuf->timeout) | ||
| 288 | return; | ||
| 289 | |||
| 290 | wbuf->timer.expires = jiffies + wbuf->timeout; | ||
| 291 | add_timer(&wbuf->timer); | ||
| 292 | } | ||
| 293 | |||
/**
 * cancel_wbuf_timer_nolock - cancel write-buffer timer.
 * @wbuf: write-buffer descriptor
 *
 * This function disarms the write-buffer synchronization timer and clears the
 * 'need_sync' flag so that a pending background sync is abandoned.
 */
static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
{
	/*
	 * If the syncer is waiting for the lock (from the background thread's
	 * context) and another task is changing write-buffer then the syncing
	 * should be canceled.
	 */
	wbuf->need_sync = 0;
	del_timer(&wbuf->timer);
}
| 308 | |||
/**
 * ubifs_wbuf_sync_nolock - synchronize write-buffer.
 * @wbuf: write-buffer to synchronize
 *
 * This function synchronizes write-buffer @wbuf: it pads the buffered data up
 * to one minimal I/O unit and writes it out, then resets the buffer to the
 * next min. I/O unit boundary. The caller must hold @wbuf->io_mutex (hence
 * the "nolock" suffix). Returns zero in case of success or a negative error
 * code in case of failure.
 */
int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
{
	struct ubifs_info *c = wbuf->c;
	int err, dirt;

	cancel_wbuf_timer_nolock(wbuf);
	if (!wbuf->used || wbuf->lnum == -1)
		/* Write-buffer is empty or not seeked */
		return 0;

	dbg_io("LEB %d:%d, %d bytes",
	       wbuf->lnum, wbuf->offs, wbuf->used);
	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
	ubifs_assert(!(wbuf->avail & 7));
	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);

	if (c->ro_media)
		return -EROFS;

	/* Pad the unused tail of the buffer and flush one min. I/O unit */
	ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
			    c->min_io_size, wbuf->dtype);
	if (err) {
		ubifs_err("cannot write %d bytes to LEB %d:%d",
			  c->min_io_size, wbuf->lnum, wbuf->offs);
		dbg_dump_stack();
		return err;
	}

	/* The padding bytes just written are accounted as dirty space */
	dirt = wbuf->avail;

	/* Advance to the next min. I/O unit under the buffer state lock */
	spin_lock(&wbuf->lock);
	wbuf->offs += c->min_io_size;
	wbuf->avail = c->min_io_size;
	wbuf->used = 0;
	wbuf->next_ino = 0;
	spin_unlock(&wbuf->lock);

	if (wbuf->sync_callback)
		err = wbuf->sync_callback(c, wbuf->lnum,
					  c->leb_size - wbuf->offs, dirt);
	return err;
}
| 359 | |||
/**
 * ubifs_wbuf_seek_nolock - seek write-buffer.
 * @wbuf: write-buffer
 * @lnum: logical eraseblock number to seek to
 * @offs: logical eraseblock offset to seek to
 * @dtype: data type
 *
 * This function targets the write buffer to logical eraseblock @lnum:@offs.
 * The write-buffer is synchronized if it is not empty. The caller must hold
 * @wbuf->io_mutex. Returns zero in case of success and a negative error code
 * in case of failure.
 */
int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
			   int dtype)
{
	const struct ubifs_info *c = wbuf->c;

	dbg_io("LEB %d:%d", lnum, offs);
	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
	ubifs_assert(offs >= 0 && offs <= c->leb_size);
	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
	ubifs_assert(lnum != wbuf->lnum);

	/* Flush any buffered data to the old position first */
	if (wbuf->used > 0) {
		int err = ubifs_wbuf_sync_nolock(wbuf);

		if (err)
			return err;
	}

	spin_lock(&wbuf->lock);
	wbuf->lnum = lnum;
	wbuf->offs = offs;
	wbuf->avail = c->min_io_size;
	wbuf->used = 0;
	spin_unlock(&wbuf->lock);
	/*
	 * NOTE(review): ->dtype is written outside ->lock — presumably it is
	 * only accessed under io_mutex; verify against readers.
	 */
	wbuf->dtype = dtype;

	return 0;
}
| 399 | |||
/**
 * ubifs_bg_wbufs_sync - synchronize write-buffers.
 * @c: UBIFS file-system description object
 *
 * This function is called by background thread to synchronize write-buffers.
 * Each journal head's write-buffer is synchronized if its timer marked it as
 * needing synchronization. Returns zero in case of success and a negative
 * error code in case of failure.
 */
int ubifs_bg_wbufs_sync(struct ubifs_info *c)
{
	int err, i;

	if (!c->need_wbuf_sync)
		return 0;
	c->need_wbuf_sync = 0;

	if (c->ro_media) {
		err = -EROFS;
		goto out_timers;
	}

	dbg_io("synchronize");
	for (i = 0; i < c->jhead_cnt; i++) {
		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;

		cond_resched();

		/*
		 * If the mutex is locked then wbuf is being changed, so
		 * synchronization is not necessary.
		 */
		if (mutex_is_locked(&wbuf->io_mutex))
			continue;

		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
		/* Re-check under the mutex - the owner may have synced */
		if (!wbuf->need_sync) {
			mutex_unlock(&wbuf->io_mutex);
			continue;
		}

		err = ubifs_wbuf_sync_nolock(wbuf);
		mutex_unlock(&wbuf->io_mutex);
		if (err) {
			ubifs_err("cannot sync write-buffer, error %d", err);
			ubifs_ro_mode(c, err);
			goto out_timers;
		}
	}

	return 0;

out_timers:
	/* Cancel all timers to prevent repeated errors */
	for (i = 0; i < c->jhead_cnt; i++) {
		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;

		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
		cancel_wbuf_timer_nolock(wbuf);
		mutex_unlock(&wbuf->io_mutex);
	}
	return err;
}
| 462 | |||
/**
 * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
 * @wbuf: write-buffer
 * @buf: node to write
 * @len: node length
 *
 * This function writes data to flash via write-buffer @wbuf. This means that
 * the last piece of the node won't reach the flash media immediately if it
 * does not take whole minimal I/O unit. Instead, the node will sit in RAM
 * until the write-buffer is synchronized (e.g., by timer). The caller must
 * hold @wbuf->io_mutex (asserted below).
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure. If the node cannot be written because there is no more
 * space in this logical eraseblock, %-ENOSPC is returned.
 */
int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
{
	struct ubifs_info *c = wbuf->c;
	int err, written, n, aligned_len = ALIGN(len, 8), offs;

	dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
	       wbuf->offs + wbuf->used);
	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));

	/* The node must fit in what remains of this eraseblock */
	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
		err = -ENOSPC;
		goto out;
	}

	cancel_wbuf_timer_nolock(wbuf);

	if (c->ro_media)
		return -EROFS;

	if (aligned_len <= wbuf->avail) {
		/*
		 * The node is not very large and fits entirely within
		 * write-buffer.
		 */
		memcpy(wbuf->buf + wbuf->used, buf, len);

		if (aligned_len == wbuf->avail) {
			/* Buffer is exactly full - flush one min. I/O unit */
			dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
			       wbuf->offs);
			err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
					    wbuf->offs, c->min_io_size,
					    wbuf->dtype);
			if (err)
				goto out;

			spin_lock(&wbuf->lock);
			wbuf->offs += c->min_io_size;
			wbuf->avail = c->min_io_size;
			wbuf->used = 0;
			wbuf->next_ino = 0;
			spin_unlock(&wbuf->lock);
		} else {
			spin_lock(&wbuf->lock);
			wbuf->avail -= aligned_len;
			wbuf->used += aligned_len;
			spin_unlock(&wbuf->lock);
		}

		goto exit;
	}

	/*
	 * The node is large enough and does not fit entirely within current
	 * minimal I/O unit. We have to fill and flush write-buffer and switch
	 * to the next min. I/O unit.
	 */
	dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
	memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
			    c->min_io_size, wbuf->dtype);
	if (err)
		goto out;

	/* Track how much of @buf has been consumed so far */
	offs = wbuf->offs + c->min_io_size;
	len -= wbuf->avail;
	aligned_len -= wbuf->avail;
	written = wbuf->avail;

	/*
	 * The remaining data may take more whole min. I/O units, so write the
	 * remains multiple to min. I/O unit size directly to the flash media.
	 * We align node length to 8-byte boundary because we anyway flash wbuf
	 * if the remaining space is less than 8 bytes.
	 */
	n = aligned_len >> c->min_io_shift;
	if (n) {
		n <<= c->min_io_shift;
		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
		err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
				    wbuf->dtype);
		if (err)
			goto out;
		offs += n;
		aligned_len -= n;
		len -= n;
		written += n;
	}

	spin_lock(&wbuf->lock);
	if (aligned_len)
		/*
		 * And now we have what's left and what does not take whole
		 * min. I/O unit, so write it to the write-buffer and we are
		 * done.
		 */
		memcpy(wbuf->buf, buf + written, len);

	wbuf->offs = offs;
	wbuf->used = aligned_len;
	wbuf->avail = c->min_io_size - aligned_len;
	wbuf->next_ino = 0;
	spin_unlock(&wbuf->lock);

exit:
	if (wbuf->sync_callback) {
		int free = c->leb_size - wbuf->offs - wbuf->used;

		err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
		if (err)
			goto out;
	}

	/* Re-arm the timer so the buffered tail gets flushed eventually */
	if (wbuf->used)
		new_wbuf_timer_nolock(wbuf);

	return 0;

out:
	ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
		  len, wbuf->lnum, wbuf->offs, err);
	dbg_dump_node(c, buf);
	dbg_dump_stack();
	dbg_dump_leb(c, wbuf->lnum);
	return err;
}
| 608 | |||
| 609 | /** | ||
| 610 | * ubifs_write_node - write node to the media. | ||
| 611 | * @c: UBIFS file-system description object | ||
| 612 | * @buf: the node to write | ||
| 613 | * @len: node length | ||
| 614 | * @lnum: logical eraseblock number | ||
| 615 | * @offs: offset within the logical eraseblock | ||
| 616 | * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) | ||
| 617 | * | ||
| 618 | * This function automatically fills node magic number, assigns sequence | ||
| 619 | * number, and calculates node CRC checksum. The length of the @buf buffer has | ||
| 620 | * to be aligned to the minimal I/O unit size. This function automatically | ||
| 621 | * appends padding node and padding bytes if needed. Returns zero in case of | ||
| 622 | * success and a negative error code in case of failure. | ||
| 623 | */ | ||
| 624 | int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, | ||
| 625 | int offs, int dtype) | ||
| 626 | { | ||
| 627 | int err, buf_len = ALIGN(len, c->min_io_size); | ||
| 628 | |||
| 629 | dbg_io("LEB %d:%d, %s, length %d (aligned %d)", | ||
| 630 | lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, | ||
| 631 | buf_len); | ||
| 632 | ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); | ||
| 633 | ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); | ||
| 634 | |||
| 635 | if (c->ro_media) | ||
| 636 | return -EROFS; | ||
| 637 | |||
| 638 | ubifs_prepare_node(c, buf, len, 1); | ||
| 639 | err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); | ||
| 640 | if (err) { | ||
| 641 | ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", | ||
| 642 | buf_len, lnum, offs, err); | ||
| 643 | dbg_dump_node(c, buf); | ||
| 644 | dbg_dump_stack(); | ||
| 645 | } | ||
| 646 | |||
| 647 | return err; | ||
| 648 | } | ||
| 649 | |||
/**
 * ubifs_read_node_wbuf - read node from the media or write-buffer.
 * @wbuf: wbuf to check for un-written data
 * @buf: buffer to read to
 * @type: node type
 * @len: node length
 * @lnum: logical eraseblock number
 * @offs: offset within the logical eraseblock
 *
 * This function reads a node of known type and length, checks it and stores
 * in @buf. If the node partially or fully sits in the write-buffer, this
 * function takes data from the buffer, otherwise it reads the flash media.
 * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative
 * error code in case of failure.
 */
int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
			 int lnum, int offs)
{
	const struct ubifs_info *c = wbuf->c;
	int err, rlen, overlap;
	struct ubifs_ch *ch = buf;

	dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
	ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
	ubifs_assert(!(offs & 7) && offs < c->leb_size);
	ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);

	spin_lock(&wbuf->lock);
	overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
	if (!overlap) {
		/* We may safely unlock the write-buffer and read the data */
		spin_unlock(&wbuf->lock);
		return ubifs_read_node(c, buf, type, len, lnum, offs);
	}

	/* Don't read under wbuf */
	rlen = wbuf->offs - offs;
	if (rlen < 0)
		rlen = 0;

	/*
	 * Copy the rest from the write-buffer. This must happen while the
	 * spinlock is still held so the buffer contents cannot change.
	 */
	memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
	spin_unlock(&wbuf->lock);

	if (rlen > 0) {
		/* Read everything that goes before write-buffer */
		err = ubi_read(c->ubi, lnum, buf, offs, rlen);
		if (err && err != -EBADMSG) {
			/* -EBADMSG means ECC trouble; checked by CRC below */
			ubifs_err("failed to read node %d from LEB %d:%d, "
				  "error %d", type, lnum, offs, err);
			dbg_dump_stack();
			return err;
		}
	}

	if (type != ch->node_type) {
		ubifs_err("bad node type (%d but expected %d)",
			  ch->node_type, type);
		goto out;
	}

	err = ubifs_check_node(c, buf, lnum, offs, 0);
	if (err) {
		ubifs_err("expected node type %d", type);
		return err;
	}

	rlen = le32_to_cpu(ch->len);
	if (rlen != len) {
		ubifs_err("bad node length %d, expected %d", rlen, len);
		goto out;
	}

	return 0;

out:
	ubifs_err("bad node at LEB %d:%d", lnum, offs);
	dbg_dump_node(c, buf);
	dbg_dump_stack();
	return -EINVAL;
}
| 731 | |||
| 732 | /** | ||
| 733 | * ubifs_read_node - read node. | ||
| 734 | * @c: UBIFS file-system description object | ||
| 735 | * @buf: buffer to read to | ||
| 736 | * @type: node type | ||
| 737 | * @len: node length (not aligned) | ||
| 738 | * @lnum: logical eraseblock number | ||
| 739 | * @offs: offset within the logical eraseblock | ||
| 740 | * | ||
| 741 | * This function reads a node of known type and and length, checks it and | ||
| 742 | * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched | ||
| 743 | * and a negative error code in case of failure. | ||
| 744 | */ | ||
| 745 | int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, | ||
| 746 | int lnum, int offs) | ||
| 747 | { | ||
| 748 | int err, l; | ||
| 749 | struct ubifs_ch *ch = buf; | ||
| 750 | |||
| 751 | dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); | ||
| 752 | ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); | ||
| 753 | ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); | ||
| 754 | ubifs_assert(!(offs & 7) && offs < c->leb_size); | ||
| 755 | ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); | ||
| 756 | |||
| 757 | err = ubi_read(c->ubi, lnum, buf, offs, len); | ||
| 758 | if (err && err != -EBADMSG) { | ||
| 759 | ubifs_err("cannot read node %d from LEB %d:%d, error %d", | ||
| 760 | type, lnum, offs, err); | ||
| 761 | return err; | ||
| 762 | } | ||
| 763 | |||
| 764 | if (type != ch->node_type) { | ||
| 765 | ubifs_err("bad node type (%d but expected %d)", | ||
| 766 | ch->node_type, type); | ||
| 767 | goto out; | ||
| 768 | } | ||
| 769 | |||
| 770 | err = ubifs_check_node(c, buf, lnum, offs, 0); | ||
| 771 | if (err) { | ||
| 772 | ubifs_err("expected node type %d", type); | ||
| 773 | return err; | ||
| 774 | } | ||
| 775 | |||
| 776 | l = le32_to_cpu(ch->len); | ||
| 777 | if (l != len) { | ||
| 778 | ubifs_err("bad node length %d, expected %d", l, len); | ||
| 779 | goto out; | ||
| 780 | } | ||
| 781 | |||
| 782 | return 0; | ||
| 783 | |||
| 784 | out: | ||
| 785 | ubifs_err("bad node at LEB %d:%d", lnum, offs); | ||
| 786 | dbg_dump_node(c, buf); | ||
| 787 | dbg_dump_stack(); | ||
| 788 | return -EINVAL; | ||
| 789 | } | ||
| 790 | |||
| 791 | /** | ||
| 792 | * ubifs_wbuf_init - initialize write-buffer. | ||
| 793 | * @c: UBIFS file-system description object | ||
| 794 | * @wbuf: write-buffer to initialize | ||
| 795 | * | ||
| 796 | * This function initializes write buffer. Returns zero in case of success | ||
| 797 | * %-ENOMEM in case of failure. | ||
| 798 | */ | ||
| 799 | int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) | ||
| 800 | { | ||
| 801 | size_t size; | ||
| 802 | |||
| 803 | wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); | ||
| 804 | if (!wbuf->buf) | ||
| 805 | return -ENOMEM; | ||
| 806 | |||
| 807 | size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); | ||
| 808 | wbuf->inodes = kmalloc(size, GFP_KERNEL); | ||
| 809 | if (!wbuf->inodes) { | ||
| 810 | kfree(wbuf->buf); | ||
| 811 | wbuf->buf = NULL; | ||
| 812 | return -ENOMEM; | ||
| 813 | } | ||
| 814 | |||
| 815 | wbuf->used = 0; | ||
| 816 | wbuf->lnum = wbuf->offs = -1; | ||
| 817 | wbuf->avail = c->min_io_size; | ||
| 818 | wbuf->dtype = UBI_UNKNOWN; | ||
| 819 | wbuf->sync_callback = NULL; | ||
| 820 | mutex_init(&wbuf->io_mutex); | ||
| 821 | spin_lock_init(&wbuf->lock); | ||
| 822 | |||
| 823 | wbuf->c = c; | ||
| 824 | init_timer(&wbuf->timer); | ||
| 825 | wbuf->timer.function = wbuf_timer_callback_nolock; | ||
| 826 | wbuf->timer.data = (unsigned long)wbuf; | ||
| 827 | wbuf->timeout = DEFAULT_WBUF_TIMEOUT; | ||
| 828 | wbuf->next_ino = 0; | ||
| 829 | |||
| 830 | return 0; | ||
| 831 | } | ||
| 832 | |||
| 833 | /** | ||
| 834 | * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. | ||
| 835 | * @wbuf: the write-buffer whereto add | ||
| 836 | * @inum: the inode number | ||
| 837 | * | ||
| 838 | * This function adds an inode number to the inode array of the write-buffer. | ||
| 839 | */ | ||
| 840 | void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) | ||
| 841 | { | ||
| 842 | if (!wbuf->buf) | ||
| 843 | /* NOR flash or something similar */ | ||
| 844 | return; | ||
| 845 | |||
| 846 | spin_lock(&wbuf->lock); | ||
| 847 | if (wbuf->used) | ||
| 848 | wbuf->inodes[wbuf->next_ino++] = inum; | ||
| 849 | spin_unlock(&wbuf->lock); | ||
| 850 | } | ||
| 851 | |||
| 852 | /** | ||
| 853 | * wbuf_has_ino - returns if the wbuf contains data from the inode. | ||
| 854 | * @wbuf: the write-buffer | ||
| 855 | * @inum: the inode number | ||
| 856 | * | ||
| 857 | * This function returns with %1 if the write-buffer contains some data from the | ||
| 858 | * given inode otherwise it returns with %0. | ||
| 859 | */ | ||
| 860 | static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) | ||
| 861 | { | ||
| 862 | int i, ret = 0; | ||
| 863 | |||
| 864 | spin_lock(&wbuf->lock); | ||
| 865 | for (i = 0; i < wbuf->next_ino; i++) | ||
| 866 | if (inum == wbuf->inodes[i]) { | ||
| 867 | ret = 1; | ||
| 868 | break; | ||
| 869 | } | ||
| 870 | spin_unlock(&wbuf->lock); | ||
| 871 | |||
| 872 | return ret; | ||
| 873 | } | ||
| 874 | |||
| 875 | /** | ||
| 876 | * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. | ||
| 877 | * @c: UBIFS file-system description object | ||
| 878 | * @inode: inode to synchronize | ||
| 879 | * | ||
| 880 | * This function synchronizes write-buffers which contain nodes belonging to | ||
| 881 | * @inode. Returns zero in case of success and a negative error code in case of | ||
| 882 | * failure. | ||
| 883 | */ | ||
| 884 | int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) | ||
| 885 | { | ||
| 886 | int i, err = 0; | ||
| 887 | |||
| 888 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 889 | struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; | ||
| 890 | |||
| 891 | if (i == GCHD) | ||
| 892 | /* | ||
| 893 | * GC head is special, do not look at it. Even if the | ||
| 894 | * head contains something related to this inode, it is | ||
| 895 | * a _copy_ of corresponding on-flash node which sits | ||
| 896 | * somewhere else. | ||
| 897 | */ | ||
| 898 | continue; | ||
| 899 | |||
| 900 | if (!wbuf_has_ino(wbuf, inode->i_ino)) | ||
| 901 | continue; | ||
| 902 | |||
| 903 | mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); | ||
| 904 | if (wbuf_has_ino(wbuf, inode->i_ino)) | ||
| 905 | err = ubifs_wbuf_sync_nolock(wbuf); | ||
| 906 | mutex_unlock(&wbuf->io_mutex); | ||
| 907 | |||
| 908 | if (err) { | ||
| 909 | ubifs_ro_mode(c, err); | ||
| 910 | return err; | ||
| 911 | } | ||
| 912 | } | ||
| 913 | return 0; | ||
| 914 | } | ||
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c new file mode 100644 index 000000000000..5e82cffe9695 --- /dev/null +++ b/fs/ubifs/ioctl.c | |||
| @@ -0,0 +1,204 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * Copyright (C) 2006, 2007 University of Szeged, Hungary | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify it | ||
| 8 | * under the terms of the GNU General Public License version 2 as published by | ||
| 9 | * the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 14 | * more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License along with | ||
| 17 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 18 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 19 | * | ||
| 20 | * Authors: Zoltan Sogor | ||
| 21 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 22 | * Adrian Hunter | ||
| 23 | */ | ||
| 24 | |||
/* This file implements EXT2-compatible inode flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) ioctl() calls */
| 26 | |||
| 27 | #include <linux/compat.h> | ||
| 28 | #include <linux/smp_lock.h> | ||
| 29 | #include <linux/mount.h> | ||
| 30 | #include "ubifs.h" | ||
| 31 | |||
| 32 | /** | ||
| 33 | * ubifs_set_inode_flags - set VFS inode flags. | ||
| 34 | * @inode: VFS inode to set flags for | ||
| 35 | * | ||
| 36 | * This function propagates flags from UBIFS inode object to VFS inode object. | ||
| 37 | */ | ||
| 38 | void ubifs_set_inode_flags(struct inode *inode) | ||
| 39 | { | ||
| 40 | unsigned int flags = ubifs_inode(inode)->flags; | ||
| 41 | |||
| 42 | inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); | ||
| 43 | if (flags & UBIFS_SYNC_FL) | ||
| 44 | inode->i_flags |= S_SYNC; | ||
| 45 | if (flags & UBIFS_APPEND_FL) | ||
| 46 | inode->i_flags |= S_APPEND; | ||
| 47 | if (flags & UBIFS_IMMUTABLE_FL) | ||
| 48 | inode->i_flags |= S_IMMUTABLE; | ||
| 49 | if (flags & UBIFS_DIRSYNC_FL) | ||
| 50 | inode->i_flags |= S_DIRSYNC; | ||
| 51 | } | ||
| 52 | |||
| 53 | /* | ||
| 54 | * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. | ||
| 55 | * @ioctl_flags: flags to convert | ||
| 56 | * | ||
| 57 | * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags | ||
| 58 | * (@UBIFS_COMPR_FL, etc). | ||
| 59 | */ | ||
| 60 | static int ioctl2ubifs(int ioctl_flags) | ||
| 61 | { | ||
| 62 | int ubifs_flags = 0; | ||
| 63 | |||
| 64 | if (ioctl_flags & FS_COMPR_FL) | ||
| 65 | ubifs_flags |= UBIFS_COMPR_FL; | ||
| 66 | if (ioctl_flags & FS_SYNC_FL) | ||
| 67 | ubifs_flags |= UBIFS_SYNC_FL; | ||
| 68 | if (ioctl_flags & FS_APPEND_FL) | ||
| 69 | ubifs_flags |= UBIFS_APPEND_FL; | ||
| 70 | if (ioctl_flags & FS_IMMUTABLE_FL) | ||
| 71 | ubifs_flags |= UBIFS_IMMUTABLE_FL; | ||
| 72 | if (ioctl_flags & FS_DIRSYNC_FL) | ||
| 73 | ubifs_flags |= UBIFS_DIRSYNC_FL; | ||
| 74 | |||
| 75 | return ubifs_flags; | ||
| 76 | } | ||
| 77 | |||
| 78 | /* | ||
| 79 | * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. | ||
| 80 | * @ubifs_flags: flags to convert | ||
| 81 | * | ||
| 82 | * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags | ||
| 83 | * (@FS_COMPR_FL, etc). | ||
| 84 | */ | ||
| 85 | static int ubifs2ioctl(int ubifs_flags) | ||
| 86 | { | ||
| 87 | int ioctl_flags = 0; | ||
| 88 | |||
| 89 | if (ubifs_flags & UBIFS_COMPR_FL) | ||
| 90 | ioctl_flags |= FS_COMPR_FL; | ||
| 91 | if (ubifs_flags & UBIFS_SYNC_FL) | ||
| 92 | ioctl_flags |= FS_SYNC_FL; | ||
| 93 | if (ubifs_flags & UBIFS_APPEND_FL) | ||
| 94 | ioctl_flags |= FS_APPEND_FL; | ||
| 95 | if (ubifs_flags & UBIFS_IMMUTABLE_FL) | ||
| 96 | ioctl_flags |= FS_IMMUTABLE_FL; | ||
| 97 | if (ubifs_flags & UBIFS_DIRSYNC_FL) | ||
| 98 | ioctl_flags |= FS_DIRSYNC_FL; | ||
| 99 | |||
| 100 | return ioctl_flags; | ||
| 101 | } | ||
| 102 | |||
/*
 * setflags - apply new ioctl inode flags to an inode.
 * @inode: VFS inode to change
 * @flags: new flags in FS_IOC_SETFLAGS (FS_*_FL) format
 *
 * Budgets for re-writing the inode, validates the change under @ui_mutex,
 * stores the flags in both the UBIFS and VFS inode objects, and marks the
 * inode dirty. Returns zero in case of success and a negative error code in
 * case of failure.
 */
static int setflags(struct inode *inode, int flags)
{
	int oldflags, err, release;
	struct ubifs_inode *ui = ubifs_inode(inode);
	struct ubifs_info *c = inode->i_sb->s_fs_info;
	/* Changing flags dirties the inode node (plus its inlined data) */
	struct ubifs_budget_req req = { .dirtied_ino = 1,
					.dirtied_ino_d = ui->data_len };

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;

	/*
	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
	 * the relevant capability.
	 */
	mutex_lock(&ui->ui_mutex);
	oldflags = ubifs2ioctl(ui->flags);
	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
		if (!capable(CAP_LINUX_IMMUTABLE)) {
			err = -EPERM;
			goto out_unlock;
		}
	}

	ui->flags = ioctl2ubifs(flags);
	ubifs_set_inode_flags(inode);
	inode->i_ctime = ubifs_current_time(inode);
	/*
	 * If the inode was already dirty, its budget was taken earlier, so
	 * this request's budget must be released (marking dirty again does
	 * not consume it).
	 */
	release = ui->dirty;
	mark_inode_dirty_sync(inode);
	mutex_unlock(&ui->ui_mutex);

	if (release)
		ubifs_release_budget(c, &req);
	/* For synchronous inodes, push the change to the media right away */
	if (IS_SYNC(inode))
		err = write_inode_now(inode, 1);
	return err;

out_unlock:
	ubifs_err("can't modify inode %lu attributes", inode->i_ino);
	mutex_unlock(&ui->ui_mutex);
	ubifs_release_budget(c, &req);
	return err;
}
| 147 | |||
| 148 | long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
| 149 | { | ||
| 150 | int flags, err; | ||
| 151 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 152 | |||
| 153 | switch (cmd) { | ||
| 154 | case FS_IOC_GETFLAGS: | ||
| 155 | flags = ubifs2ioctl(ubifs_inode(inode)->flags); | ||
| 156 | |||
| 157 | return put_user(flags, (int __user *) arg); | ||
| 158 | |||
| 159 | case FS_IOC_SETFLAGS: { | ||
| 160 | if (IS_RDONLY(inode)) | ||
| 161 | return -EROFS; | ||
| 162 | |||
| 163 | if (!is_owner_or_cap(inode)) | ||
| 164 | return -EACCES; | ||
| 165 | |||
| 166 | if (get_user(flags, (int __user *) arg)) | ||
| 167 | return -EFAULT; | ||
| 168 | |||
| 169 | if (!S_ISDIR(inode->i_mode)) | ||
| 170 | flags &= ~FS_DIRSYNC_FL; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Make sure the file-system is read-write and make sure it | ||
| 174 | * will not become read-only while we are changing the flags. | ||
| 175 | */ | ||
| 176 | err = mnt_want_write(file->f_path.mnt); | ||
| 177 | if (err) | ||
| 178 | return err; | ||
| 179 | err = setflags(inode, flags); | ||
| 180 | mnt_drop_write(file->f_path.mnt); | ||
| 181 | return err; | ||
| 182 | } | ||
| 183 | |||
| 184 | default: | ||
| 185 | return -ENOTTY; | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
#ifdef CONFIG_COMPAT
/*
 * ubifs_compat_ioctl - 32-bit compatibility ioctl() handler.
 *
 * Maps the 32-bit flag ioctls onto their native counterparts and forwards to
 * 'ubifs_ioctl()'. Returns %-ENOIOCTLCMD for unsupported commands.
 */
long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	unsigned int native_cmd;

	if (cmd == FS_IOC32_GETFLAGS)
		native_cmd = FS_IOC_GETFLAGS;
	else if (cmd == FS_IOC32_SETFLAGS)
		native_cmd = FS_IOC_SETFLAGS;
	else
		return -ENOIOCTLCMD;

	return ubifs_ioctl(file, native_cmd, (unsigned long)compat_ptr(arg));
}
#endif
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c new file mode 100644 index 000000000000..283155abe5f5 --- /dev/null +++ b/fs/ubifs/journal.c | |||
| @@ -0,0 +1,1387 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements UBIFS journal. | ||
| 25 | * | ||
| 26 | * The journal consists of 2 parts - the log and bud LEBs. The log has fixed | ||
| 27 | * length and position, while a bud logical eraseblock is any LEB in the main | ||
| 28 | * area. Buds contain file system data - data nodes, inode nodes, etc. The log | ||
| 29 | * contains only references to buds and some other stuff like commit | ||
| 30 | * start node. The idea is that when we commit the journal, we do | ||
| 31 | * not copy the data, the buds just become indexed. Since after the commit the | ||
| 32 | * nodes in bud eraseblocks become leaf nodes of the file system index tree, we | ||
| 33 | * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will | ||
 * become leaves in the future.
| 35 | * | ||
| 36 | * The journal is multi-headed because we want to write data to the journal as | ||
| 37 | * optimally as possible. It is nice to have nodes belonging to the same inode | ||
| 38 | * in one LEB, so we may write data owned by different inodes to different | ||
| 39 | * journal heads, although at present only one data head is used. | ||
| 40 | * | ||
| 41 | * For recovery reasons, the base head contains all inode nodes, all directory | ||
| 42 | * entry nodes and all truncate nodes. This means that the other heads contain | ||
| 43 | * only data nodes. | ||
| 44 | * | ||
| 45 | * Bud LEBs may be half-indexed. For example, if the bud was not full at the | ||
| 46 | * time of commit, the bud is retained to continue to be used in the journal, | ||
| 47 | * even though the "front" of the LEB is now indexed. In that case, the log | ||
| 48 | * reference contains the offset where the bud starts for the purposes of the | ||
| 49 | * journal. | ||
| 50 | * | ||
| 51 | * The journal size has to be limited, because the larger is the journal, the | ||
| 52 | * longer it takes to mount UBIFS (scanning the journal) and the more memory it | ||
| 53 | * takes (indexing in the TNC). | ||
| 54 | * | ||
| 55 | * All the journal write operations like 'ubifs_jnl_update()' here, which write | ||
| 56 | * multiple UBIFS nodes to the journal at one go, are atomic with respect to | ||
| 57 | * unclean reboots. Should the unclean reboot happen, the recovery code drops | ||
| 58 | * all the nodes. | ||
| 59 | */ | ||
| 60 | |||
| 61 | #include "ubifs.h" | ||
| 62 | |||
| 63 | /** | ||
| 64 | * zero_ino_node_unused - zero out unused fields of an on-flash inode node. | ||
| 65 | * @ino: the inode to zero out | ||
| 66 | */ | ||
| 67 | static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) | ||
| 68 | { | ||
| 69 | memset(ino->padding1, 0, 4); | ||
| 70 | memset(ino->padding2, 0, 26); | ||
| 71 | } | ||
| 72 | |||
| 73 | /** | ||
| 74 | * zero_dent_node_unused - zero out unused fields of an on-flash directory | ||
| 75 | * entry node. | ||
| 76 | * @dent: the directory entry to zero out | ||
| 77 | */ | ||
| 78 | static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) | ||
| 79 | { | ||
| 80 | dent->padding1 = 0; | ||
| 81 | memset(dent->padding2, 0, 4); | ||
| 82 | } | ||
| 83 | |||
| 84 | /** | ||
| 85 | * zero_data_node_unused - zero out unused fields of an on-flash data node. | ||
| 86 | * @data: the data node to zero out | ||
| 87 | */ | ||
| 88 | static inline void zero_data_node_unused(struct ubifs_data_node *data) | ||
| 89 | { | ||
| 90 | memset(data->padding, 0, 2); | ||
| 91 | } | ||
| 92 | |||
| 93 | /** | ||
| 94 | * zero_trun_node_unused - zero out unused fields of an on-flash truncation | ||
| 95 | * node. | ||
| 96 | * @trun: the truncation node to zero out | ||
| 97 | */ | ||
| 98 | static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) | ||
| 99 | { | ||
| 100 | memset(trun->padding, 0, 12); | ||
| 101 | } | ||
| 102 | |||
/**
 * reserve_space - reserve space in the journal.
 * @c: UBIFS file-system description object
 * @jhead: journal head number
 * @len: node length
 *
 * This function reserves space in journal head @jhead. If the reservation
 * succeeded, the journal head stays locked and later has to be unlocked using
 * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock
 * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and
 * other negative error codes in case of other failures.
 */
static int reserve_space(struct ubifs_info *c, int jhead, int len)
{
	int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;

	/*
	 * Typically, the base head has smaller nodes written to it, so it is
	 * better to try to allocate space at the ends of eraseblocks. This is
	 * what the squeeze parameter does.
	 */
	squeeze = (jhead == BASEHD);
again:
	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);

	if (c->ro_media) {
		err = -EROFS;
		goto out_unlock;
	}

	avail = c->leb_size - wbuf->offs - wbuf->used;
	/*
	 * Fast path: the head already points at an LEB with enough room.
	 * Note, on success @wbuf->io_mutex stays locked for the caller.
	 */
	if (wbuf->lnum != -1 && avail >= len)
		return 0;

	/*
	 * Write-buffer wasn't seek'ed or there is not enough space - look for
	 * an LEB with some empty space.
	 */
	lnum = ubifs_find_free_space(c, len, &free, squeeze);
	if (lnum >= 0) {
		/* Found an LEB, add it to the journal head */
		offs = c->leb_size - free;
		err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
		if (err)
			goto out_return;
		/* A new bud was successfully allocated and added to the log */
		goto out;
	}

	err = lnum;
	if (err != -ENOSPC)
		goto out_unlock;

	/*
	 * No free space, we have to run garbage collector to make
	 * some. But the write-buffer mutex has to be unlocked because
	 * GC also takes it.
	 */
	dbg_jnl("no free space jhead %d, run GC", jhead);
	mutex_unlock(&wbuf->io_mutex);

	lnum = ubifs_garbage_collect(c, 0);
	if (lnum < 0) {
		err = lnum;
		if (err != -ENOSPC)
			return err;

		/*
		 * GC could not make a free LEB. But someone else may
		 * have allocated new bud for this journal head,
		 * because we dropped @wbuf->io_mutex, so try once
		 * again.
		 */
		dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
		if (retries++ < 2) {
			dbg_jnl("retry (%d)", retries);
			goto again;
		}

		dbg_jnl("return -ENOSPC");
		return err;
	}

	/* GC gave us an LEB; re-take the head lock before using it */
	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
	dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
	avail = c->leb_size - wbuf->offs - wbuf->used;

	if (wbuf->lnum != -1 && avail >= len) {
		/*
		 * Someone else has switched the journal head and we have
		 * enough space now. This happens when more than one process
		 * is trying to write to the same journal head at the same
		 * time.
		 */
		dbg_jnl("return LEB %d back, already have LEB %d:%d",
			lnum, wbuf->lnum, wbuf->offs + wbuf->used);
		err = ubifs_return_leb(c, lnum);
		if (err)
			goto out_unlock;
		return 0;
	}

	err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
	if (err)
		goto out_return;
	offs = 0;

out:
	/* Point the write-buffer at the reserved position of the new bud */
	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
	if (err)
		goto out_unlock;

	return 0;

out_unlock:
	mutex_unlock(&wbuf->io_mutex);
	return err;

out_return:
	/* An error occurred and the LEB has to be returned to lprops */
	ubifs_assert(err < 0);
	err1 = ubifs_return_leb(c, lnum);
	if (err1 && err == -EAGAIN)
		/*
		 * Return original error code only if it is not %-EAGAIN,
		 * which is not really an error. Otherwise, return the error
		 * code of 'ubifs_return_leb()'.
		 */
		err = err1;
	mutex_unlock(&wbuf->io_mutex);
	return err;
}
| 235 | |||
| 236 | /** | ||
| 237 | * write_node - write node to a journal head. | ||
| 238 | * @c: UBIFS file-system description object | ||
| 239 | * @jhead: journal head | ||
| 240 | * @node: node to write | ||
| 241 | * @len: node length | ||
| 242 | * @lnum: LEB number written is returned here | ||
| 243 | * @offs: offset written is returned here | ||
| 244 | * | ||
| 245 | * This function writes a node to reserved space of journal head @jhead. | ||
| 246 | * Returns zero in case of success and a negative error code in case of | ||
| 247 | * failure. | ||
| 248 | */ | ||
| 249 | static int write_node(struct ubifs_info *c, int jhead, void *node, int len, | ||
| 250 | int *lnum, int *offs) | ||
| 251 | { | ||
| 252 | struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; | ||
| 253 | |||
| 254 | ubifs_assert(jhead != GCHD); | ||
| 255 | |||
| 256 | *lnum = c->jheads[jhead].wbuf.lnum; | ||
| 257 | *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; | ||
| 258 | |||
| 259 | dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); | ||
| 260 | ubifs_prepare_node(c, node, len, 0); | ||
| 261 | |||
| 262 | return ubifs_wbuf_write_nolock(wbuf, node, len); | ||
| 263 | } | ||
| 264 | |||
| 265 | /** | ||
| 266 | * write_head - write data to a journal head. | ||
| 267 | * @c: UBIFS file-system description object | ||
| 268 | * @jhead: journal head | ||
| 269 | * @buf: buffer to write | ||
| 270 | * @len: length to write | ||
| 271 | * @lnum: LEB number written is returned here | ||
| 272 | * @offs: offset written is returned here | ||
| 273 | * @sync: non-zero if the write-buffer has to by synchronized | ||
| 274 | * | ||
| 275 | * This function is the same as 'write_node()' but it does not assume the | ||
| 276 | * buffer it is writing is a node, so it does not prepare it (which means | ||
| 277 | * initializing common header and calculating CRC). | ||
| 278 | */ | ||
| 279 | static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, | ||
| 280 | int *lnum, int *offs, int sync) | ||
| 281 | { | ||
| 282 | int err; | ||
| 283 | struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; | ||
| 284 | |||
| 285 | ubifs_assert(jhead != GCHD); | ||
| 286 | |||
| 287 | *lnum = c->jheads[jhead].wbuf.lnum; | ||
| 288 | *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; | ||
| 289 | dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); | ||
| 290 | |||
| 291 | err = ubifs_wbuf_write_nolock(wbuf, buf, len); | ||
| 292 | if (err) | ||
| 293 | return err; | ||
| 294 | if (sync) | ||
| 295 | err = ubifs_wbuf_sync_nolock(wbuf); | ||
| 296 | return err; | ||
| 297 | } | ||
| 298 | |||
/**
 * make_reservation - reserve journal space.
 * @c: UBIFS file-system description object
 * @jhead: journal head
 * @len: how many bytes to reserve
 *
 * This function makes space reservation in journal head @jhead. The function
 * takes the commit lock and locks the journal head, and the caller has to
 * unlock the head and finish the reservation with 'finish_reservation()'.
 * Returns zero in case of success and a negative error code in case of
 * failure.
 *
 * Note, the journal head may be unlocked as soon as the data is written, while
 * the commit lock has to be released after the data has been added to the
 * TNC.
 */
static int make_reservation(struct ubifs_info *c, int jhead, int len)
{
	int err, cmt_retries = 0, nospc_retries = 0;

again:
	down_read(&c->commit_sem);
	err = reserve_space(c, jhead, len);
	if (!err)
		/* Success - @commit_sem and the head lock stay held */
		return 0;
	up_read(&c->commit_sem);

	if (err == -ENOSPC) {
		/*
		 * GC could not make any progress. We should try to commit
		 * once because it could make some dirty space and GC would
		 * make progress, so make the error -EAGAIN so that the below
		 * will commit and re-try.
		 */
		if (nospc_retries++ < 2) {
			dbg_jnl("no space, retry");
			err = -EAGAIN;
		}

		/*
		 * This means that the budgeting is incorrect. We always have
		 * to be able to write to the media, because all operations are
		 * budgeted. Deletions are not budgeted, though, but we reserve
		 * an extra LEB for them.
		 */
	}

	if (err != -EAGAIN)
		goto out;

	/*
	 * -EAGAIN means that the journal is full or too large, or the above
	 * code wants to do one commit. Do this and re-try.
	 */
	if (cmt_retries > 128) {
		/*
		 * This should not happen unless the journal size limitations
		 * are too tough.
		 */
		ubifs_err("stuck in space allocation");
		err = -ENOSPC;
		goto out;
	} else if (cmt_retries > 32)
		ubifs_warn("too many space allocation re-tries (%d)",
			   cmt_retries);

	dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
		cmt_retries);
	cmt_retries += 1;

	err = ubifs_run_commit(c);
	if (err)
		return err;
	goto again;

out:
	ubifs_err("cannot reserve %d bytes in jhead %d, error %d",
		  len, jhead, err);
	if (err == -ENOSPC) {
		/* These are some budgeting problems, print useful information */
		down_write(&c->commit_sem);
		spin_lock(&c->space_lock);
		dbg_dump_stack();
		dbg_dump_budg(c);
		spin_unlock(&c->space_lock);
		dbg_dump_lprops(c);
		/* @cmt_retries is re-used as a scratch variable here */
		cmt_retries = dbg_check_lprops(c);
		up_write(&c->commit_sem);
	}
	return err;
}
| 390 | |||
| 391 | /** | ||
| 392 | * release_head - release a journal head. | ||
| 393 | * @c: UBIFS file-system description object | ||
| 394 | * @jhead: journal head | ||
| 395 | * | ||
| 396 | * This function releases journal head @jhead which was locked by | ||
| 397 | * the 'make_reservation()' function. It has to be called after each successful | ||
| 398 | * 'make_reservation()' invocation. | ||
| 399 | */ | ||
| 400 | static inline void release_head(struct ubifs_info *c, int jhead) | ||
| 401 | { | ||
| 402 | mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); | ||
| 403 | } | ||
| 404 | |||
/**
 * finish_reservation - finish a reservation.
 * @c: UBIFS file-system description object
 *
 * This function finishes journal space reservation. It must be called after
 * 'make_reservation()'.
 */
static void finish_reservation(struct ubifs_info *c)
{
	/* Drop the read-side commit lock taken by 'make_reservation()' */
	up_read(&c->commit_sem);
}
| 416 | |||
| 417 | /** | ||
| 418 | * get_dent_type - translate VFS inode mode to UBIFS directory entry type. | ||
| 419 | * @mode: inode mode | ||
| 420 | */ | ||
| 421 | static int get_dent_type(int mode) | ||
| 422 | { | ||
| 423 | switch (mode & S_IFMT) { | ||
| 424 | case S_IFREG: | ||
| 425 | return UBIFS_ITYPE_REG; | ||
| 426 | case S_IFDIR: | ||
| 427 | return UBIFS_ITYPE_DIR; | ||
| 428 | case S_IFLNK: | ||
| 429 | return UBIFS_ITYPE_LNK; | ||
| 430 | case S_IFBLK: | ||
| 431 | return UBIFS_ITYPE_BLK; | ||
| 432 | case S_IFCHR: | ||
| 433 | return UBIFS_ITYPE_CHR; | ||
| 434 | case S_IFIFO: | ||
| 435 | return UBIFS_ITYPE_FIFO; | ||
| 436 | case S_IFSOCK: | ||
| 437 | return UBIFS_ITYPE_SOCK; | ||
| 438 | default: | ||
| 439 | BUG(); | ||
| 440 | } | ||
| 441 | return 0; | ||
| 442 | } | ||
| 443 | |||
/**
 * pack_inode - pack an inode node.
 * @c: UBIFS file-system description object
 * @ino: buffer in which to pack inode node
 * @inode: inode to pack
 * @last: indicates the last node of the group
 * @last_reference: non-zero if this is a deletion inode
 *
 * Serializes @inode into the on-flash inode node format at @ino: all
 * multi-byte fields are converted to little-endian, unused space is zeroed,
 * and the node is prepared as a group member via 'ubifs_prep_grp_node()'.
 */
static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
		       const struct inode *inode, int last,
		       int last_reference)
{
	int data_len = 0;
	struct ubifs_inode *ui = ubifs_inode(inode);

	ino->ch.node_type = UBIFS_INO_NODE;
	ino_key_init_flash(c, &ino->key, inode->i_ino);
	ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
	ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
	ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
	ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec);
	ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
	ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
	ino->uid = cpu_to_le32(inode->i_uid);
	ino->gid = cpu_to_le32(inode->i_gid);
	ino->mode = cpu_to_le32(inode->i_mode);
	ino->flags = cpu_to_le32(ui->flags);
	ino->size = cpu_to_le64(ui->ui_size);
	ino->nlink = cpu_to_le32(inode->i_nlink);
	ino->compr_type = cpu_to_le16(ui->compr_type);
	ino->data_len = cpu_to_le32(ui->data_len);
	ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt);
	ino->xattr_size = cpu_to_le32(ui->xattr_size);
	ino->xattr_names = cpu_to_le32(ui->xattr_names);
	zero_ino_node_unused(ino);

	/*
	 * Drop the attached data if this is a deletion inode, the data is not
	 * needed anymore.
	 */
	if (!last_reference) {
		memcpy(ino->data, ui->data, ui->data_len);
		data_len = ui->data_len;
	}

	ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last);
}
| 492 | |||
/**
 * mark_inode_clean - mark UBIFS inode as clean.
 * @c: UBIFS file-system description object
 * @ui: UBIFS inode to mark as clean
 *
 * This helper function marks UBIFS inode @ui as clean by cleaning the
 * @ui->dirty flag and releasing its budget. Note, VFS may still treat the
 * inode as dirty and try to write it back, but 'ubifs_write_inode()' would
 * just do nothing.
 */
static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
{
	/* Only a dirty inode holds budget; release it before clearing */
	if (ui->dirty)
		ubifs_release_dirty_inode_budget(c, ui);
	ui->dirty = 0;
}
| 509 | |||
/**
 * ubifs_jnl_update - update inode.
 * @c: UBIFS file-system description object
 * @dir: parent inode or host inode in case of extended attributes
 * @nm: directory entry name
 * @inode: inode to update
 * @deletion: indicates a directory entry deletion i.e unlink or rmdir
 * @xent: non-zero if the directory entry is an extended attribute entry
 *
 * This function updates an inode by writing a directory entry (or extended
 * attribute entry), the inode itself, and the parent directory inode (or the
 * host inode) to the journal.
 *
 * The function writes the host inode @dir last, which is important in case of
 * extended attributes. Indeed, then we guarantee that if the host inode gets
 * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed,
 * the extended attribute inode gets flushed too. And this is exactly what the
 * user expects - synchronizing the host inode synchronizes its extended
 * attributes. Similarly, this guarantees that if @dir is synchronized, its
 * directory entry corresponding to @nm gets synchronized too.
 *
 * If the inode (@inode) or the parent directory (@dir) are synchronous, this
 * function synchronizes the write-buffer.
 *
 * This function marks the @dir and @inode inodes as clean and returns zero on
 * success. In case of failure, a negative error code is returned.
 */
int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
		     const struct qstr *nm, const struct inode *inode,
		     int deletion, int xent)
{
	int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
	int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
	int last_reference = !!(deletion && inode->i_nlink == 0);
	struct ubifs_inode *ui = ubifs_inode(inode);
	struct ubifs_inode *dir_ui = ubifs_inode(dir);
	struct ubifs_dent_node *dent;
	struct ubifs_ino_node *ino;
	union ubifs_key dent_key, ino_key;

	dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
		inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
	ubifs_assert(dir_ui->data_len == 0);
	ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));

	/* +1 for the terminating '\0' of the entry name */
	dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
	ilen = UBIFS_INO_NODE_SZ;

	/*
	 * If the last reference to the inode is being deleted, then there is
	 * no need to attach and write inode data, it is being deleted anyway.
	 * And if the inode is being deleted, no need to synchronize
	 * write-buffer even if the inode is synchronous.
	 */
	if (!last_reference) {
		ilen += ui->data_len;
		sync |= IS_SYNC(inode);
	}

	/* One buffer holds the group: dent, inode, and parent inode nodes */
	aligned_dlen = ALIGN(dlen, 8);
	aligned_ilen = ALIGN(ilen, 8);
	len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
	dent = kmalloc(len, GFP_NOFS);
	if (!dent)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, len);
	if (err)
		goto out_free;

	if (!xent) {
		dent->ch.node_type = UBIFS_DENT_NODE;
		dent_key_init(c, &dent_key, dir->i_ino, nm);
	} else {
		dent->ch.node_type = UBIFS_XENT_NODE;
		xent_key_init(c, &dent_key, dir->i_ino, nm);
	}

	key_write(c, &dent_key, dent->key);
	/* A deletion entry is encoded with inode number 0 */
	dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
	dent->type = get_dent_type(inode->i_mode);
	dent->nlen = cpu_to_le16(nm->len);
	memcpy(dent->name, nm->name, nm->len);
	dent->name[nm->len] = '\0';
	zero_dent_node_unused(dent);
	ubifs_prep_grp_node(c, dent, dlen, 0);

	/* Pack @inode, then @dir last (see the function comment above) */
	ino = (void *)dent + aligned_dlen;
	pack_inode(c, ino, inode, 0, last_reference);
	ino = (void *)ino + aligned_ilen;
	pack_inode(c, ino, dir, 1, 0);

	if (last_reference) {
		/* Record the orphan before the journal write becomes visible */
		err = ubifs_add_orphan(c, inode->i_ino);
		if (err) {
			release_head(c, BASEHD);
			goto out_finish;
		}
	}

	err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
	if (err)
		goto out_release;
	if (!sync) {
		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;

		ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
		ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino);
	}
	release_head(c, BASEHD);
	kfree(dent);

	if (deletion) {
		err = ubifs_tnc_remove_nm(c, &dent_key, nm);
		if (err)
			goto out_ro;
		/* The just-written deletion dent is immediately dirty space */
		err = ubifs_add_dirt(c, lnum, dlen);
	} else
		err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
	if (err)
		goto out_ro;

	/*
	 * Note, we do not remove the inode from TNC even if the last reference
	 * to it has just been deleted, because the inode may still be opened.
	 * Instead, the inode has been added to orphan lists and the orphan
	 * subsystem will take further care about it.
	 */
	ino_key_init(c, &ino_key, inode->i_ino);
	ino_offs = dent_offs + aligned_dlen;
	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
	if (err)
		goto out_ro;

	ino_key_init(c, &ino_key, dir->i_ino);
	ino_offs += aligned_ilen;
	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ);
	if (err)
		goto out_ro;

	finish_reservation(c);
	spin_lock(&ui->ui_lock);
	ui->synced_i_size = ui->ui_size;
	spin_unlock(&ui->ui_lock);
	mark_inode_clean(c, ui);
	mark_inode_clean(c, dir_ui);
	return 0;

out_finish:
	finish_reservation(c);
out_free:
	kfree(dent);
	return err;

out_release:
	release_head(c, BASEHD);
out_ro:
	/* Journal and TNC may now disagree - force read-only mode */
	ubifs_ro_mode(c, err);
	if (last_reference)
		ubifs_delete_orphan(c, inode->i_ino);
	finish_reservation(c);
	return err;
}
| 674 | |||
| 675 | /** | ||
| 676 | * ubifs_jnl_write_data - write a data node to the journal. | ||
| 677 | * @c: UBIFS file-system description object | ||
| 678 | * @inode: inode the data node belongs to | ||
| 679 | * @key: node key | ||
| 680 | * @buf: buffer to write | ||
| 681 | * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) | ||
| 682 | * | ||
| 683 | * This function writes a data node to the journal. Returns %0 if the data node | ||
| 684 | * was successfully written, and a negative error code in case of failure. | ||
| 685 | */ | ||
| 686 | int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, | ||
| 687 | const union ubifs_key *key, const void *buf, int len) | ||
| 688 | { | ||
| 689 | struct ubifs_data_node *data; | ||
| 690 | int err, lnum, offs, compr_type, out_len; | ||
| 691 | int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; | ||
| 692 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 693 | |||
| 694 | dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key), | ||
| 695 | key_block(c, key), len, DBGKEY(key)); | ||
| 696 | ubifs_assert(len <= UBIFS_BLOCK_SIZE); | ||
| 697 | |||
| 698 | data = kmalloc(dlen, GFP_NOFS); | ||
| 699 | if (!data) | ||
| 700 | return -ENOMEM; | ||
| 701 | |||
| 702 | data->ch.node_type = UBIFS_DATA_NODE; | ||
| 703 | key_write(c, key, &data->key); | ||
| 704 | data->size = cpu_to_le32(len); | ||
| 705 | zero_data_node_unused(data); | ||
| 706 | |||
| 707 | if (!(ui->flags && UBIFS_COMPR_FL)) | ||
| 708 | /* Compression is disabled for this inode */ | ||
| 709 | compr_type = UBIFS_COMPR_NONE; | ||
| 710 | else | ||
| 711 | compr_type = ui->compr_type; | ||
| 712 | |||
| 713 | out_len = dlen - UBIFS_DATA_NODE_SZ; | ||
| 714 | ubifs_compress(buf, len, &data->data, &out_len, &compr_type); | ||
| 715 | ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); | ||
| 716 | |||
| 717 | dlen = UBIFS_DATA_NODE_SZ + out_len; | ||
| 718 | data->compr_type = cpu_to_le16(compr_type); | ||
| 719 | |||
| 720 | /* Make reservation before allocating sequence numbers */ | ||
| 721 | err = make_reservation(c, DATAHD, dlen); | ||
| 722 | if (err) | ||
| 723 | goto out_free; | ||
| 724 | |||
| 725 | err = write_node(c, DATAHD, data, dlen, &lnum, &offs); | ||
| 726 | if (err) | ||
| 727 | goto out_release; | ||
| 728 | ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); | ||
| 729 | release_head(c, DATAHD); | ||
| 730 | |||
| 731 | err = ubifs_tnc_add(c, key, lnum, offs, dlen); | ||
| 732 | if (err) | ||
| 733 | goto out_ro; | ||
| 734 | |||
| 735 | finish_reservation(c); | ||
| 736 | kfree(data); | ||
| 737 | return 0; | ||
| 738 | |||
| 739 | out_release: | ||
| 740 | release_head(c, DATAHD); | ||
| 741 | out_ro: | ||
| 742 | ubifs_ro_mode(c, err); | ||
| 743 | finish_reservation(c); | ||
| 744 | out_free: | ||
| 745 | kfree(data); | ||
| 746 | return err; | ||
| 747 | } | ||
| 748 | |||
/**
 * ubifs_jnl_write_inode - flush inode to the journal.
 * @c: UBIFS file-system description object
 * @inode: inode to flush
 * @deletion: inode has been deleted
 *
 * This function writes inode @inode to the journal. If the inode is
 * synchronous, it also synchronizes the write-buffer. Returns zero in case of
 * success and a negative error code in case of failure.
 */
int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
			  int deletion)
{
	int err, len, lnum, offs, sync = 0;
	struct ubifs_ino_node *ino;
	struct ubifs_inode *ui = ubifs_inode(inode);

	dbg_jnl("ino %lu%s", inode->i_ino,
		deletion ? " (last reference)" : "");
	if (deletion)
		ubifs_assert(inode->i_nlink == 0);

	len = UBIFS_INO_NODE_SZ;
	/*
	 * If the inode is being deleted, do not write the attached data. No
	 * need to synchronize the write-buffer either.
	 */
	if (!deletion) {
		len += ui->data_len;
		sync = IS_SYNC(inode);
	}
	ino = kmalloc(len, GFP_NOFS);
	if (!ino)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, len);
	if (err)
		goto out_free;

	/* @deletion doubles as the 'last_reference' flag for pack_inode() */
	pack_inode(c, ino, inode, 1, deletion);
	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
	if (err)
		goto out_release;
	if (!sync)
		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
					  inode->i_ino);
	release_head(c, BASEHD);

	if (deletion) {
		/* Purge the inode from TNC and retire its orphan record */
		err = ubifs_tnc_remove_ino(c, inode->i_ino);
		if (err)
			goto out_ro;
		ubifs_delete_orphan(c, inode->i_ino);
		err = ubifs_add_dirt(c, lnum, len);
	} else {
		union ubifs_key key;

		ino_key_init(c, &key, inode->i_ino);
		err = ubifs_tnc_add(c, &key, lnum, offs, len);
	}
	if (err)
		goto out_ro;

	finish_reservation(c);
	spin_lock(&ui->ui_lock);
	ui->synced_i_size = ui->ui_size;
	spin_unlock(&ui->ui_lock);
	kfree(ino);
	return 0;

out_release:
	release_head(c, BASEHD);
out_ro:
	ubifs_ro_mode(c, err);
	finish_reservation(c);
out_free:
	kfree(ino);
	return err;
}
| 829 | |||
/**
 * ubifs_jnl_rename - rename a directory entry.
 * @c: UBIFS file-system description object
 * @old_dir: parent inode of directory entry to rename
 * @old_dentry: directory entry to rename
 * @new_dir: parent inode of directory entry to rename
 * @new_dentry: new directory entry (or directory entry to replace)
 * @sync: non-zero if the write-buffer has to be synchronized
 *
 * This function implements the re-name operation which may involve writing up
 * to 3 inodes and 2 directory entries. It marks the written inodes as clean
 * and returns zero on success. In case of failure, a negative error code is
 * returned.
 *
 * The group is laid out as: new dent, deletion dent for the old name,
 * optionally the replaced inode (@new_dentry->d_inode), then @old_dir and -
 * if this is a cross-directory move - @new_dir. The TNC-update offsets below
 * must mirror this packing order exactly.
 */
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
		     const struct dentry *old_dentry,
		     const struct inode *new_dir,
		     const struct dentry *new_dentry, int sync)
{
	void *p;
	union ubifs_key key;
	struct ubifs_dent_node *dent, *dent2;
	int err, dlen1, dlen2, ilen, lnum, offs, len;
	const struct inode *old_inode = old_dentry->d_inode;
	const struct inode *new_inode = new_dentry->d_inode;
	int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
	int last_reference = !!(new_inode && new_inode->i_nlink == 0);
	int move = (old_dir != new_dir);
	struct ubifs_inode *uninitialized_var(new_ui);

	dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
		old_dentry->d_name.len, old_dentry->d_name.name,
		old_dir->i_ino, new_dentry->d_name.len,
		new_dentry->d_name.name, new_dir->i_ino);
	ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
	ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
	ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
	ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));

	/* +1 for the terminating '\0' of each entry name */
	dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
	dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
	if (new_inode) {
		new_ui = ubifs_inode(new_inode);
		ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
		ilen = UBIFS_INO_NODE_SZ;
		if (!last_reference)
			ilen += new_ui->data_len;
	} else
		ilen = 0;

	aligned_dlen1 = ALIGN(dlen1, 8);
	aligned_dlen2 = ALIGN(dlen2, 8);
	len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
	if (old_dir != new_dir)
		len += plen;
	dent = kmalloc(len, GFP_NOFS);
	if (!dent)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, len);
	if (err)
		goto out_free;

	/* Make new dent */
	dent->ch.node_type = UBIFS_DENT_NODE;
	dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
	dent->inum = cpu_to_le64(old_inode->i_ino);
	dent->type = get_dent_type(old_inode->i_mode);
	dent->nlen = cpu_to_le16(new_dentry->d_name.len);
	memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
	dent->name[new_dentry->d_name.len] = '\0';
	zero_dent_node_unused(dent);
	ubifs_prep_grp_node(c, dent, dlen1, 0);

	/* Make deletion dent (inum 0 marks a deletion entry) */
	dent2 = (void *)dent + aligned_dlen1;
	dent2->ch.node_type = UBIFS_DENT_NODE;
	dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
			    &old_dentry->d_name);
	dent2->inum = 0;
	dent2->type = DT_UNKNOWN;
	dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
	memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
	dent2->name[old_dentry->d_name.len] = '\0';
	zero_dent_node_unused(dent2);
	ubifs_prep_grp_node(c, dent2, dlen2, 0);

	p = (void *)dent2 + aligned_dlen2;
	if (new_inode) {
		pack_inode(c, p, new_inode, 0, last_reference);
		p += ALIGN(ilen, 8);
	}

	if (!move)
		pack_inode(c, p, old_dir, 1, 0);
	else {
		pack_inode(c, p, old_dir, 0, 0);
		p += ALIGN(plen, 8);
		pack_inode(c, p, new_dir, 1, 0);
	}

	if (last_reference) {
		/* The replaced inode loses its last link - record orphan */
		err = ubifs_add_orphan(c, new_inode->i_ino);
		if (err) {
			release_head(c, BASEHD);
			goto out_finish;
		}
	}

	err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
	if (err)
		goto out_release;
	if (!sync) {
		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;

		ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
		ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
		if (new_inode)
			ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
						  new_inode->i_ino);
	}
	release_head(c, BASEHD);

	dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
	if (err)
		goto out_ro;

	/* The deletion dent becomes dirty space as soon as it is written */
	err = ubifs_add_dirt(c, lnum, dlen2);
	if (err)
		goto out_ro;

	dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
	err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
	if (err)
		goto out_ro;

	/* Offsets below track the packing order used above */
	offs += aligned_dlen1 + aligned_dlen2;
	if (new_inode) {
		ino_key_init(c, &key, new_inode->i_ino);
		err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
		if (err)
			goto out_ro;
		offs += ALIGN(ilen, 8);
	}

	ino_key_init(c, &key, old_dir->i_ino);
	err = ubifs_tnc_add(c, &key, lnum, offs, plen);
	if (err)
		goto out_ro;

	if (old_dir != new_dir) {
		offs += ALIGN(plen, 8);
		ino_key_init(c, &key, new_dir->i_ino);
		err = ubifs_tnc_add(c, &key, lnum, offs, plen);
		if (err)
			goto out_ro;
	}

	finish_reservation(c);
	if (new_inode) {
		mark_inode_clean(c, new_ui);
		spin_lock(&new_ui->ui_lock);
		new_ui->synced_i_size = new_ui->ui_size;
		spin_unlock(&new_ui->ui_lock);
	}
	mark_inode_clean(c, ubifs_inode(old_dir));
	if (move)
		mark_inode_clean(c, ubifs_inode(new_dir));
	kfree(dent);
	return 0;

out_release:
	release_head(c, BASEHD);
out_ro:
	/* Journal and TNC may now disagree - force read-only mode */
	ubifs_ro_mode(c, err);
	if (last_reference)
		ubifs_delete_orphan(c, new_inode->i_ino);
out_finish:
	finish_reservation(c);
out_free:
	kfree(dent);
	return err;
}
| 1015 | |||
| 1016 | /** | ||
| 1017 | * recomp_data_node - re-compress a truncated data node. | ||
| 1018 | * @dn: data node to re-compress | ||
| 1019 | * @new_len: new length | ||
| 1020 | * | ||
| 1021 | * This function is used when an inode is truncated and the last data node of | ||
| 1022 | * the inode has to be re-compressed and re-written. | ||
| 1023 | */ | ||
| 1024 | static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) | ||
| 1025 | { | ||
| 1026 | void *buf; | ||
| 1027 | int err, len, compr_type, out_len; | ||
| 1028 | |||
| 1029 | out_len = le32_to_cpu(dn->size); | ||
| 1030 | buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); | ||
| 1031 | if (!buf) | ||
| 1032 | return -ENOMEM; | ||
| 1033 | |||
| 1034 | len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; | ||
| 1035 | compr_type = le16_to_cpu(dn->compr_type); | ||
| 1036 | err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); | ||
| 1037 | if (err) | ||
| 1038 | goto out; | ||
| 1039 | |||
| 1040 | ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); | ||
| 1041 | ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); | ||
| 1042 | dn->compr_type = cpu_to_le16(compr_type); | ||
| 1043 | dn->size = cpu_to_le32(*new_len); | ||
| 1044 | *new_len = UBIFS_DATA_NODE_SZ + out_len; | ||
| 1045 | out: | ||
| 1046 | kfree(buf); | ||
| 1047 | return err; | ||
| 1048 | } | ||
| 1049 | |||
/**
 * ubifs_jnl_truncate - update the journal for a truncation.
 * @c: UBIFS file-system description object
 * @inode: inode to truncate
 * @old_size: old size
 * @new_size: new size
 *
 * When the size of a file decreases due to truncation, a truncation node is
 * written, the journal tree is updated, and the last data block is re-written
 * if it has been affected. The inode is also updated in order to synchronize
 * the new inode size.
 *
 * This function marks the inode as clean and returns zero on success. In case
 * of failure, a negative error code is returned.
 */
int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
		       loff_t old_size, loff_t new_size)
{
	union ubifs_key key, to_key;
	struct ubifs_ino_node *ino;
	struct ubifs_trun_node *trun;
	struct ubifs_data_node *uninitialized_var(dn);
	int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
	struct ubifs_inode *ui = ubifs_inode(inode);
	ino_t inum = inode->i_ino;
	unsigned int blk;

	dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
	ubifs_assert(!ui->data_len);
	ubifs_assert(S_ISREG(inode->i_mode));
	ubifs_assert(mutex_is_locked(&ui->ui_mutex));

	/*
	 * One buffer holds the whole group: inode node, truncation node, and
	 * possibly the re-written last data block (worst-case compressed).
	 */
	sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
	     UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
	ino = kmalloc(sz, GFP_NOFS);
	if (!ino)
		return -ENOMEM;

	trun = (void *)ino + UBIFS_INO_NODE_SZ;
	trun->ch.node_type = UBIFS_TRUN_NODE;
	trun->inum = cpu_to_le32(inum);
	trun->old_size = cpu_to_le64(old_size);
	trun->new_size = cpu_to_le64(new_size);
	zero_trun_node_unused(trun);

	/* Non-zero when the new size cuts through the middle of a block */
	dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
	if (dlen) {
		/* Get last data block so it can be truncated */
		dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
		blk = new_size >> UBIFS_BLOCK_SHIFT;
		data_key_init(c, &key, inum, blk);
		dbg_jnl("last block key %s", DBGKEY(&key));
		err = ubifs_tnc_lookup(c, &key, dn);
		if (err == -ENOENT)
			dlen = 0; /* Not found (so it is a hole) */
		else if (err)
			goto out_free;
		else {
			if (le32_to_cpu(dn->size) <= dlen)
				dlen = 0; /* Nothing to do */
			else {
				int compr_type = le16_to_cpu(dn->compr_type);

				if (compr_type != UBIFS_COMPR_NONE) {
					err = recomp_data_node(dn, &dlen);
					if (err)
						goto out_free;
				} else {
					dn->size = cpu_to_le32(dlen);
					dlen += UBIFS_DATA_NODE_SZ;
				}
				zero_data_node_unused(dn);
			}
		}
	}

	/* Must make reservation before allocating sequence numbers */
	len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
	if (dlen)
		len += dlen;
	err = make_reservation(c, BASEHD, len);
	if (err)
		goto out_free;

	pack_inode(c, ino, inode, 0, 0);
	ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
	if (dlen)
		ubifs_prep_grp_node(c, dn, dlen, 1);

	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
	if (err)
		goto out_release;
	if (!sync)
		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
	release_head(c, BASEHD);

	if (dlen) {
		/* @key still holds the last data block's key from above */
		sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
		err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
		if (err)
			goto out_ro;
	}

	ino_key_init(c, &key, inum);
	err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
	if (err)
		goto out_ro;

	/* Truncation nodes are replayed, not indexed - account as dirt */
	err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
	if (err)
		goto out_ro;

	/* Remove TNC entries for all data blocks beyond the new size */
	bit = new_size & (UBIFS_BLOCK_SIZE - 1);
	blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
	data_key_init(c, &key, inum, blk);

	bit = old_size & (UBIFS_BLOCK_SIZE - 1);
	blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
	data_key_init(c, &to_key, inum, blk);

	err = ubifs_tnc_remove_range(c, &key, &to_key);
	if (err)
		goto out_ro;

	finish_reservation(c);
	spin_lock(&ui->ui_lock);
	ui->synced_i_size = ui->ui_size;
	spin_unlock(&ui->ui_lock);
	mark_inode_clean(c, ui);
	kfree(ino);
	return 0;

out_release:
	release_head(c, BASEHD);
out_ro:
	ubifs_ro_mode(c, err);
	finish_reservation(c);
out_free:
	kfree(ino);
	return err;
}
| 1191 | |||
| 1192 | #ifdef CONFIG_UBIFS_FS_XATTR | ||
| 1193 | |||
/**
 * ubifs_jnl_delete_xattr - delete an extended attribute.
 * @c: UBIFS file-system description object
 * @host: host inode
 * @inode: extended attribute inode
 * @nm: extended attribute entry name
 *
 * This function deletes an extended attribute, which is very similar to
 * un-linking regular files - it writes a deletion xentry, a deletion inode and
 * updates the target inode. Returns zero in case of success and a negative
 * error code in case of failure.
 */
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
			   const struct inode *inode, const struct qstr *nm)
{
	int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
	struct ubifs_dent_node *xent;
	struct ubifs_ino_node *ino;
	union ubifs_key xent_key, key1, key2;
	int sync = IS_DIRSYNC(host);
	struct ubifs_inode *host_ui = ubifs_inode(host);

	dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
		host->i_ino, inode->i_ino, nm->name,
		ubifs_inode(inode)->data_len);
	ubifs_assert(inode->i_nlink == 0);
	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));

	/*
	 * Since we are deleting the inode, we do not bother to attach any data
	 * to it and assume its length is %UBIFS_INO_NODE_SZ.
	 */
	xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
	aligned_xlen = ALIGN(xlen, 8);
	hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
	/* Total journal write: deletion xentry + xattr inode + host inode */
	len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);

	xent = kmalloc(len, GFP_NOFS);
	if (!xent)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, len);
	if (err) {
		kfree(xent);
		return err;
	}

	/* An inum of zero marks this xentry as a deletion entry */
	xent->ch.node_type = UBIFS_XENT_NODE;
	xent_key_init(c, &xent_key, host->i_ino, nm);
	key_write(c, &xent_key, xent->key);
	xent->inum = 0;
	xent->type = get_dent_type(inode->i_mode);
	xent->nlen = cpu_to_le16(nm->len);
	memcpy(xent->name, nm->name, nm->len);
	xent->name[nm->len] = '\0';
	zero_dent_node_unused(xent);
	ubifs_prep_grp_node(c, xent, xlen, 0);

	/* Pack the xattr inode and then the host inode after the xentry */
	ino = (void *)xent + aligned_xlen;
	pack_inode(c, ino, inode, 0, 1);
	ino = (void *)ino + UBIFS_INO_NODE_SZ;
	pack_inode(c, ino, host, 1, 0);

	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
	if (!sync && !err)
		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
	release_head(c, BASEHD);
	kfree(xent);
	if (err)
		goto out_ro;

	/* Remove the extended attribute entry from TNC */
	err = ubifs_tnc_remove_nm(c, &xent_key, nm);
	if (err)
		goto out_ro;
	err = ubifs_add_dirt(c, lnum, xlen);
	if (err)
		goto out_ro;

	/*
	 * Remove all nodes belonging to the extended attribute inode from TNC.
	 * Well, there actually must be only one node - the inode itself.
	 */
	lowest_ino_key(c, &key1, inode->i_ino);
	highest_ino_key(c, &key2, inode->i_ino);
	err = ubifs_tnc_remove_range(c, &key1, &key2);
	if (err)
		goto out_ro;
	err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
	if (err)
		goto out_ro;

	/* And update TNC with the new host inode position */
	ino_key_init(c, &key1, host->i_ino);
	err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
	if (err)
		goto out_ro;

	finish_reservation(c);
	/* The host inode is now fully on the media */
	spin_lock(&host_ui->ui_lock);
	host_ui->synced_i_size = host_ui->ui_size;
	spin_unlock(&host_ui->ui_lock);
	mark_inode_clean(c, host_ui);
	return 0;

out_ro:
	/* Journal and TNC may be inconsistent now - go read-only */
	ubifs_ro_mode(c, err);
	finish_reservation(c);
	return err;
}
| 1305 | |||
| 1306 | /** | ||
| 1307 | * ubifs_jnl_change_xattr - change an extended attribute. | ||
| 1308 | * @c: UBIFS file-system description object | ||
| 1309 | * @inode: extended attribute inode | ||
| 1310 | * @host: host inode | ||
| 1311 | * | ||
| 1312 | * This function writes the updated version of an extended attribute inode and | ||
| 1313 | * the host inode tho the journal (to the base head). The host inode is written | ||
| 1314 | * after the extended attribute inode in order to guarantee that the extended | ||
| 1315 | * attribute will be flushed when the inode is synchronized by 'fsync()' and | ||
| 1316 | * consequently, the write-buffer is synchronized. This function returns zero | ||
| 1317 | * in case of success and a negative error code in case of failure. | ||
| 1318 | */ | ||
| 1319 | int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, | ||
| 1320 | const struct inode *host) | ||
| 1321 | { | ||
| 1322 | int err, len1, len2, aligned_len, aligned_len1, lnum, offs; | ||
| 1323 | struct ubifs_inode *host_ui = ubifs_inode(inode); | ||
| 1324 | struct ubifs_ino_node *ino; | ||
| 1325 | union ubifs_key key; | ||
| 1326 | int sync = IS_DIRSYNC(host); | ||
| 1327 | |||
| 1328 | dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); | ||
| 1329 | ubifs_assert(host->i_nlink > 0); | ||
| 1330 | ubifs_assert(inode->i_nlink > 0); | ||
| 1331 | ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); | ||
| 1332 | |||
| 1333 | len1 = UBIFS_INO_NODE_SZ + host_ui->data_len; | ||
| 1334 | len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len; | ||
| 1335 | aligned_len1 = ALIGN(len1, 8); | ||
| 1336 | aligned_len = aligned_len1 + ALIGN(len2, 8); | ||
| 1337 | |||
| 1338 | ino = kmalloc(aligned_len, GFP_NOFS); | ||
| 1339 | if (!ino) | ||
| 1340 | return -ENOMEM; | ||
| 1341 | |||
| 1342 | /* Make reservation before allocating sequence numbers */ | ||
| 1343 | err = make_reservation(c, BASEHD, aligned_len); | ||
| 1344 | if (err) | ||
| 1345 | goto out_free; | ||
| 1346 | |||
| 1347 | pack_inode(c, ino, host, 0, 0); | ||
| 1348 | pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); | ||
| 1349 | |||
| 1350 | err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); | ||
| 1351 | if (!sync && !err) { | ||
| 1352 | struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; | ||
| 1353 | |||
| 1354 | ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino); | ||
| 1355 | ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); | ||
| 1356 | } | ||
| 1357 | release_head(c, BASEHD); | ||
| 1358 | if (err) | ||
| 1359 | goto out_ro; | ||
| 1360 | |||
| 1361 | ino_key_init(c, &key, host->i_ino); | ||
| 1362 | err = ubifs_tnc_add(c, &key, lnum, offs, len1); | ||
| 1363 | if (err) | ||
| 1364 | goto out_ro; | ||
| 1365 | |||
| 1366 | ino_key_init(c, &key, inode->i_ino); | ||
| 1367 | err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); | ||
| 1368 | if (err) | ||
| 1369 | goto out_ro; | ||
| 1370 | |||
| 1371 | finish_reservation(c); | ||
| 1372 | spin_lock(&host_ui->ui_lock); | ||
| 1373 | host_ui->synced_i_size = host_ui->ui_size; | ||
| 1374 | spin_unlock(&host_ui->ui_lock); | ||
| 1375 | mark_inode_clean(c, host_ui); | ||
| 1376 | kfree(ino); | ||
| 1377 | return 0; | ||
| 1378 | |||
| 1379 | out_ro: | ||
| 1380 | ubifs_ro_mode(c, err); | ||
| 1381 | finish_reservation(c); | ||
| 1382 | out_free: | ||
| 1383 | kfree(ino); | ||
| 1384 | return err; | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | #endif /* CONFIG_UBIFS_FS_XATTR */ | ||
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h new file mode 100644 index 000000000000..8f7476007549 --- /dev/null +++ b/fs/ubifs/key.h | |||
| @@ -0,0 +1,533 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
 * This header contains various key-related definitions and helper functions.
| 25 | * UBIFS allows several key schemes, so we access key fields only via these | ||
| 26 | * helpers. At the moment only one key scheme is supported. | ||
| 27 | * | ||
| 28 | * Simple key scheme | ||
| 29 | * ~~~~~~~~~~~~~~~~~ | ||
| 30 | * | ||
| 31 | * Keys are 64-bits long. First 32-bits are inode number (parent inode number | ||
| 32 | * in case of direntry key). Next 3 bits are node type. The last 29 bits are | ||
| 33 | * 4KiB offset in case of inode node, and direntry hash in case of a direntry | ||
| 34 | * node. We use "r5" hash borrowed from reiserfs. | ||
| 35 | */ | ||
| 36 | |||
| 37 | #ifndef __UBIFS_KEY_H__ | ||
| 38 | #define __UBIFS_KEY_H__ | ||
| 39 | |||
| 40 | /** | ||
| 41 | * key_r5_hash - R5 hash function (borrowed from reiserfs). | ||
| 42 | * @s: direntry name | ||
| 43 | * @len: name length | ||
| 44 | */ | ||
| 45 | static inline uint32_t key_r5_hash(const char *s, int len) | ||
| 46 | { | ||
| 47 | uint32_t a = 0; | ||
| 48 | const signed char *str = (const signed char *)s; | ||
| 49 | |||
| 50 | while (*str) { | ||
| 51 | a += *str << 4; | ||
| 52 | a += *str >> 4; | ||
| 53 | a *= 11; | ||
| 54 | str++; | ||
| 55 | } | ||
| 56 | |||
| 57 | a &= UBIFS_S_KEY_HASH_MASK; | ||
| 58 | |||
| 59 | /* | ||
| 60 | * We use hash values as offset in directories, so values %0 and %1 are | ||
| 61 | * reserved for "." and "..". %2 is reserved for "end of readdir" | ||
| 62 | * marker. | ||
| 63 | */ | ||
| 64 | if (unlikely(a >= 0 && a <= 2)) | ||
| 65 | a += 3; | ||
| 66 | return a; | ||
| 67 | } | ||
| 68 | |||
| 69 | /** | ||
| 70 | * key_test_hash - testing hash function. | ||
| 71 | * @str: direntry name | ||
| 72 | * @len: name length | ||
| 73 | */ | ||
| 74 | static inline uint32_t key_test_hash(const char *str, int len) | ||
| 75 | { | ||
| 76 | uint32_t a = 0; | ||
| 77 | |||
| 78 | len = min_t(uint32_t, len, 4); | ||
| 79 | memcpy(&a, str, len); | ||
| 80 | a &= UBIFS_S_KEY_HASH_MASK; | ||
| 81 | if (unlikely(a >= 0 && a <= 2)) | ||
| 82 | a += 3; | ||
| 83 | return a; | ||
| 84 | } | ||
| 85 | |||
| 86 | /** | ||
| 87 | * ino_key_init - initialize inode key. | ||
| 88 | * @c: UBIFS file-system description object | ||
| 89 | * @key: key to initialize | ||
| 90 | * @inum: inode number | ||
| 91 | */ | ||
| 92 | static inline void ino_key_init(const struct ubifs_info *c, | ||
| 93 | union ubifs_key *key, ino_t inum) | ||
| 94 | { | ||
| 95 | key->u32[0] = inum; | ||
| 96 | key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; | ||
| 97 | } | ||
| 98 | |||
| 99 | /** | ||
| 100 | * ino_key_init_flash - initialize on-flash inode key. | ||
| 101 | * @c: UBIFS file-system description object | ||
| 102 | * @k: key to initialize | ||
| 103 | * @inum: inode number | ||
| 104 | */ | ||
| 105 | static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, | ||
| 106 | ino_t inum) | ||
| 107 | { | ||
| 108 | union ubifs_key *key = k; | ||
| 109 | |||
| 110 | key->j32[0] = cpu_to_le32(inum); | ||
| 111 | key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); | ||
| 112 | memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); | ||
| 113 | } | ||
| 114 | |||
| 115 | /** | ||
| 116 | * lowest_ino_key - get the lowest possible inode key. | ||
| 117 | * @c: UBIFS file-system description object | ||
| 118 | * @key: key to initialize | ||
| 119 | * @inum: inode number | ||
| 120 | */ | ||
| 121 | static inline void lowest_ino_key(const struct ubifs_info *c, | ||
| 122 | union ubifs_key *key, ino_t inum) | ||
| 123 | { | ||
| 124 | key->u32[0] = inum; | ||
| 125 | key->u32[1] = 0; | ||
| 126 | } | ||
| 127 | |||
| 128 | /** | ||
| 129 | * highest_ino_key - get the highest possible inode key. | ||
| 130 | * @c: UBIFS file-system description object | ||
| 131 | * @key: key to initialize | ||
| 132 | * @inum: inode number | ||
| 133 | */ | ||
| 134 | static inline void highest_ino_key(const struct ubifs_info *c, | ||
| 135 | union ubifs_key *key, ino_t inum) | ||
| 136 | { | ||
| 137 | key->u32[0] = inum; | ||
| 138 | key->u32[1] = 0xffffffff; | ||
| 139 | } | ||
| 140 | |||
| 141 | /** | ||
| 142 | * dent_key_init - initialize directory entry key. | ||
| 143 | * @c: UBIFS file-system description object | ||
| 144 | * @key: key to initialize | ||
| 145 | * @inum: parent inode number | ||
| 146 | * @nm: direntry name and length | ||
| 147 | */ | ||
| 148 | static inline void dent_key_init(const struct ubifs_info *c, | ||
| 149 | union ubifs_key *key, ino_t inum, | ||
| 150 | const struct qstr *nm) | ||
| 151 | { | ||
| 152 | uint32_t hash = c->key_hash(nm->name, nm->len); | ||
| 153 | |||
| 154 | ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); | ||
| 155 | key->u32[0] = inum; | ||
| 156 | key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); | ||
| 157 | } | ||
| 158 | |||
| 159 | /** | ||
| 160 | * dent_key_init_hash - initialize directory entry key without re-calculating | ||
| 161 | * hash function. | ||
| 162 | * @c: UBIFS file-system description object | ||
| 163 | * @key: key to initialize | ||
| 164 | * @inum: parent inode number | ||
| 165 | * @hash: direntry name hash | ||
| 166 | */ | ||
| 167 | static inline void dent_key_init_hash(const struct ubifs_info *c, | ||
| 168 | union ubifs_key *key, ino_t inum, | ||
| 169 | uint32_t hash) | ||
| 170 | { | ||
| 171 | ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); | ||
| 172 | key->u32[0] = inum; | ||
| 173 | key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); | ||
| 174 | } | ||
| 175 | |||
| 176 | /** | ||
| 177 | * dent_key_init_flash - initialize on-flash directory entry key. | ||
| 178 | * @c: UBIFS file-system description object | ||
| 179 | * @k: key to initialize | ||
| 180 | * @inum: parent inode number | ||
| 181 | * @nm: direntry name and length | ||
| 182 | */ | ||
| 183 | static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, | ||
| 184 | ino_t inum, const struct qstr *nm) | ||
| 185 | { | ||
| 186 | union ubifs_key *key = k; | ||
| 187 | uint32_t hash = c->key_hash(nm->name, nm->len); | ||
| 188 | |||
| 189 | ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); | ||
| 190 | key->j32[0] = cpu_to_le32(inum); | ||
| 191 | key->j32[1] = cpu_to_le32(hash | | ||
| 192 | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); | ||
| 193 | memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); | ||
| 194 | } | ||
| 195 | |||
| 196 | /** | ||
| 197 | * lowest_dent_key - get the lowest possible directory entry key. | ||
| 198 | * @c: UBIFS file-system description object | ||
| 199 | * @key: where to store the lowest key | ||
| 200 | * @inum: parent inode number | ||
| 201 | */ | ||
| 202 | static inline void lowest_dent_key(const struct ubifs_info *c, | ||
| 203 | union ubifs_key *key, ino_t inum) | ||
| 204 | { | ||
| 205 | key->u32[0] = inum; | ||
| 206 | key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS; | ||
| 207 | } | ||
| 208 | |||
| 209 | /** | ||
| 210 | * xent_key_init - initialize extended attribute entry key. | ||
| 211 | * @c: UBIFS file-system description object | ||
| 212 | * @key: key to initialize | ||
| 213 | * @inum: host inode number | ||
| 214 | * @nm: extended attribute entry name and length | ||
| 215 | */ | ||
| 216 | static inline void xent_key_init(const struct ubifs_info *c, | ||
| 217 | union ubifs_key *key, ino_t inum, | ||
| 218 | const struct qstr *nm) | ||
| 219 | { | ||
| 220 | uint32_t hash = c->key_hash(nm->name, nm->len); | ||
| 221 | |||
| 222 | ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); | ||
| 223 | key->u32[0] = inum; | ||
| 224 | key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); | ||
| 225 | } | ||
| 226 | |||
| 227 | /** | ||
| 228 | * xent_key_init_hash - initialize extended attribute entry key without | ||
| 229 | * re-calculating hash function. | ||
| 230 | * @c: UBIFS file-system description object | ||
| 231 | * @key: key to initialize | ||
| 232 | * @inum: host inode number | ||
| 233 | * @hash: extended attribute entry name hash | ||
| 234 | */ | ||
| 235 | static inline void xent_key_init_hash(const struct ubifs_info *c, | ||
| 236 | union ubifs_key *key, ino_t inum, | ||
| 237 | uint32_t hash) | ||
| 238 | { | ||
| 239 | ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); | ||
| 240 | key->u32[0] = inum; | ||
| 241 | key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); | ||
| 242 | } | ||
| 243 | |||
| 244 | /** | ||
| 245 | * xent_key_init_flash - initialize on-flash extended attribute entry key. | ||
| 246 | * @c: UBIFS file-system description object | ||
| 247 | * @k: key to initialize | ||
| 248 | * @inum: host inode number | ||
| 249 | * @nm: extended attribute entry name and length | ||
| 250 | */ | ||
| 251 | static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, | ||
| 252 | ino_t inum, const struct qstr *nm) | ||
| 253 | { | ||
| 254 | union ubifs_key *key = k; | ||
| 255 | uint32_t hash = c->key_hash(nm->name, nm->len); | ||
| 256 | |||
| 257 | ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); | ||
| 258 | key->j32[0] = cpu_to_le32(inum); | ||
| 259 | key->j32[1] = cpu_to_le32(hash | | ||
| 260 | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS)); | ||
| 261 | memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); | ||
| 262 | } | ||
| 263 | |||
| 264 | /** | ||
| 265 | * lowest_xent_key - get the lowest possible extended attribute entry key. | ||
| 266 | * @c: UBIFS file-system description object | ||
| 267 | * @key: where to store the lowest key | ||
| 268 | * @inum: host inode number | ||
| 269 | */ | ||
| 270 | static inline void lowest_xent_key(const struct ubifs_info *c, | ||
| 271 | union ubifs_key *key, ino_t inum) | ||
| 272 | { | ||
| 273 | key->u32[0] = inum; | ||
| 274 | key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS; | ||
| 275 | } | ||
| 276 | |||
| 277 | /** | ||
| 278 | * data_key_init - initialize data key. | ||
| 279 | * @c: UBIFS file-system description object | ||
| 280 | * @key: key to initialize | ||
| 281 | * @inum: inode number | ||
| 282 | * @block: block number | ||
| 283 | */ | ||
| 284 | static inline void data_key_init(const struct ubifs_info *c, | ||
| 285 | union ubifs_key *key, ino_t inum, | ||
| 286 | unsigned int block) | ||
| 287 | { | ||
| 288 | ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); | ||
| 289 | key->u32[0] = inum; | ||
| 290 | key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS); | ||
| 291 | } | ||
| 292 | |||
| 293 | /** | ||
| 294 | * data_key_init_flash - initialize on-flash data key. | ||
| 295 | * @c: UBIFS file-system description object | ||
| 296 | * @k: key to initialize | ||
| 297 | * @inum: inode number | ||
| 298 | * @block: block number | ||
| 299 | */ | ||
| 300 | static inline void data_key_init_flash(const struct ubifs_info *c, void *k, | ||
| 301 | ino_t inum, unsigned int block) | ||
| 302 | { | ||
| 303 | union ubifs_key *key = k; | ||
| 304 | |||
| 305 | ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); | ||
| 306 | key->j32[0] = cpu_to_le32(inum); | ||
| 307 | key->j32[1] = cpu_to_le32(block | | ||
| 308 | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS)); | ||
| 309 | memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); | ||
| 310 | } | ||
| 311 | |||
| 312 | /** | ||
| 313 | * trun_key_init - initialize truncation node key. | ||
| 314 | * @c: UBIFS file-system description object | ||
| 315 | * @key: key to initialize | ||
| 316 | * @inum: inode number | ||
| 317 | * | ||
| 318 | * Note, UBIFS does not have truncation keys on the media and this function is | ||
| 319 | * only used for purposes of replay. | ||
| 320 | */ | ||
| 321 | static inline void trun_key_init(const struct ubifs_info *c, | ||
| 322 | union ubifs_key *key, ino_t inum) | ||
| 323 | { | ||
| 324 | key->u32[0] = inum; | ||
| 325 | key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS; | ||
| 326 | } | ||
| 327 | |||
| 328 | /** | ||
| 329 | * key_type - get key type. | ||
| 330 | * @c: UBIFS file-system description object | ||
| 331 | * @key: key to get type of | ||
| 332 | */ | ||
| 333 | static inline int key_type(const struct ubifs_info *c, | ||
| 334 | const union ubifs_key *key) | ||
| 335 | { | ||
| 336 | return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS; | ||
| 337 | } | ||
| 338 | |||
| 339 | /** | ||
| 340 | * key_type_flash - get type of a on-flash formatted key. | ||
| 341 | * @c: UBIFS file-system description object | ||
| 342 | * @k: key to get type of | ||
| 343 | */ | ||
| 344 | static inline int key_type_flash(const struct ubifs_info *c, const void *k) | ||
| 345 | { | ||
| 346 | const union ubifs_key *key = k; | ||
| 347 | |||
| 348 | return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS; | ||
| 349 | } | ||
| 350 | |||
| 351 | /** | ||
| 352 | * key_inum - fetch inode number from key. | ||
| 353 | * @c: UBIFS file-system description object | ||
| 354 | * @k: key to fetch inode number from | ||
| 355 | */ | ||
| 356 | static inline ino_t key_inum(const struct ubifs_info *c, const void *k) | ||
| 357 | { | ||
| 358 | const union ubifs_key *key = k; | ||
| 359 | |||
| 360 | return key->u32[0]; | ||
| 361 | } | ||
| 362 | |||
| 363 | /** | ||
| 364 | * key_inum_flash - fetch inode number from an on-flash formatted key. | ||
| 365 | * @c: UBIFS file-system description object | ||
| 366 | * @k: key to fetch inode number from | ||
| 367 | */ | ||
| 368 | static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) | ||
| 369 | { | ||
| 370 | const union ubifs_key *key = k; | ||
| 371 | |||
| 372 | return le32_to_cpu(key->j32[0]); | ||
| 373 | } | ||
| 374 | |||
| 375 | /** | ||
| 376 | * key_hash - get directory entry hash. | ||
| 377 | * @c: UBIFS file-system description object | ||
| 378 | * @key: the key to get hash from | ||
| 379 | */ | ||
| 380 | static inline int key_hash(const struct ubifs_info *c, | ||
| 381 | const union ubifs_key *key) | ||
| 382 | { | ||
| 383 | return key->u32[1] & UBIFS_S_KEY_HASH_MASK; | ||
| 384 | } | ||
| 385 | |||
| 386 | /** | ||
| 387 | * key_hash_flash - get directory entry hash from an on-flash formatted key. | ||
| 388 | * @c: UBIFS file-system description object | ||
| 389 | * @k: the key to get hash from | ||
| 390 | */ | ||
| 391 | static inline int key_hash_flash(const struct ubifs_info *c, const void *k) | ||
| 392 | { | ||
| 393 | const union ubifs_key *key = k; | ||
| 394 | |||
| 395 | return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK; | ||
| 396 | } | ||
| 397 | |||
| 398 | /** | ||
| 399 | * key_block - get data block number. | ||
| 400 | * @c: UBIFS file-system description object | ||
| 401 | * @key: the key to get the block number from | ||
| 402 | */ | ||
| 403 | static inline unsigned int key_block(const struct ubifs_info *c, | ||
| 404 | const union ubifs_key *key) | ||
| 405 | { | ||
| 406 | return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK; | ||
| 407 | } | ||
| 408 | |||
| 409 | /** | ||
| 410 | * key_block_flash - get data block number from an on-flash formatted key. | ||
| 411 | * @c: UBIFS file-system description object | ||
| 412 | * @k: the key to get the block number from | ||
| 413 | */ | ||
| 414 | static inline unsigned int key_block_flash(const struct ubifs_info *c, | ||
| 415 | const void *k) | ||
| 416 | { | ||
| 417 | const union ubifs_key *key = k; | ||
| 418 | |||
| 419 | return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK; | ||
| 420 | } | ||
| 421 | |||
| 422 | /** | ||
| 423 | * key_read - transform a key to in-memory format. | ||
| 424 | * @c: UBIFS file-system description object | ||
| 425 | * @from: the key to transform | ||
| 426 | * @to: the key to store the result | ||
| 427 | */ | ||
| 428 | static inline void key_read(const struct ubifs_info *c, const void *from, | ||
| 429 | union ubifs_key *to) | ||
| 430 | { | ||
| 431 | const union ubifs_key *f = from; | ||
| 432 | |||
| 433 | to->u32[0] = le32_to_cpu(f->j32[0]); | ||
| 434 | to->u32[1] = le32_to_cpu(f->j32[1]); | ||
| 435 | } | ||
| 436 | |||
| 437 | /** | ||
| 438 | * key_write - transform a key from in-memory format. | ||
| 439 | * @c: UBIFS file-system description object | ||
| 440 | * @from: the key to transform | ||
| 441 | * @to: the key to store the result | ||
| 442 | */ | ||
| 443 | static inline void key_write(const struct ubifs_info *c, | ||
| 444 | const union ubifs_key *from, void *to) | ||
| 445 | { | ||
| 446 | union ubifs_key *t = to; | ||
| 447 | |||
| 448 | t->j32[0] = cpu_to_le32(from->u32[0]); | ||
| 449 | t->j32[1] = cpu_to_le32(from->u32[1]); | ||
| 450 | memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8); | ||
| 451 | } | ||
| 452 | |||
| 453 | /** | ||
| 454 | * key_write_idx - transform a key from in-memory format for the index. | ||
| 455 | * @c: UBIFS file-system description object | ||
| 456 | * @from: the key to transform | ||
| 457 | * @to: the key to store the result | ||
| 458 | */ | ||
| 459 | static inline void key_write_idx(const struct ubifs_info *c, | ||
| 460 | const union ubifs_key *from, void *to) | ||
| 461 | { | ||
| 462 | union ubifs_key *t = to; | ||
| 463 | |||
| 464 | t->j32[0] = cpu_to_le32(from->u32[0]); | ||
| 465 | t->j32[1] = cpu_to_le32(from->u32[1]); | ||
| 466 | } | ||
| 467 | |||
/**
 * key_copy - copy a key.
 * @c: UBIFS file-system description object
 * @from: the key to copy from
 * @to: the key to copy to
 */
static inline void key_copy(const struct ubifs_info *c,
			    const union ubifs_key *from, union ubifs_key *to)
{
	/* In the simple key format the whole key fits in one 64-bit word */
	to->u64[0] = from->u64[0];
}
| 479 | |||
| 480 | /** | ||
| 481 | * keys_cmp - compare keys. | ||
| 482 | * @c: UBIFS file-system description object | ||
| 483 | * @key1: the first key to compare | ||
| 484 | * @key2: the second key to compare | ||
| 485 | * | ||
| 486 | * This function compares 2 keys and returns %-1 if @key1 is less than | ||
| 487 | * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. | ||
| 488 | */ | ||
| 489 | static inline int keys_cmp(const struct ubifs_info *c, | ||
| 490 | const union ubifs_key *key1, | ||
| 491 | const union ubifs_key *key2) | ||
| 492 | { | ||
| 493 | if (key1->u32[0] < key2->u32[0]) | ||
| 494 | return -1; | ||
| 495 | if (key1->u32[0] > key2->u32[0]) | ||
| 496 | return 1; | ||
| 497 | if (key1->u32[1] < key2->u32[1]) | ||
| 498 | return -1; | ||
| 499 | if (key1->u32[1] > key2->u32[1]) | ||
| 500 | return 1; | ||
| 501 | |||
| 502 | return 0; | ||
| 503 | } | ||
| 504 | |||
| 505 | /** | ||
| 506 | * is_hash_key - is a key vulnerable to hash collisions. | ||
| 507 | * @c: UBIFS file-system description object | ||
| 508 | * @key: key | ||
| 509 | * | ||
| 510 | * This function returns %1 if @key is a hashed key or %0 otherwise. | ||
| 511 | */ | ||
| 512 | static inline int is_hash_key(const struct ubifs_info *c, | ||
| 513 | const union ubifs_key *key) | ||
| 514 | { | ||
| 515 | int type = key_type(c, key); | ||
| 516 | |||
| 517 | return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY; | ||
| 518 | } | ||
| 519 | |||
| 520 | /** | ||
| 521 | * key_max_inode_size - get maximum file size allowed by current key format. | ||
| 522 | * @c: UBIFS file-system description object | ||
| 523 | */ | ||
| 524 | static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) | ||
| 525 | { | ||
| 526 | switch (c->key_fmt) { | ||
| 527 | case UBIFS_SIMPLE_KEY_FMT: | ||
| 528 | return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE; | ||
| 529 | default: | ||
| 530 | return 0; | ||
| 531 | } | ||
| 532 | } | ||
| 533 | #endif /* !__UBIFS_KEY_H__ */ | ||
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c new file mode 100644 index 000000000000..36857b9ed59e --- /dev/null +++ b/fs/ubifs/log.c | |||
| @@ -0,0 +1,805 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file is a part of UBIFS journal implementation and contains various | ||
| 25 | * functions which manipulate the log. The log is a fixed area on the flash | ||
| 26 | * which does not contain any data but refers to buds. The log is a part of the | ||
| 27 | * journal. | ||
| 28 | */ | ||
| 29 | |||
| 30 | #include "ubifs.h" | ||
| 31 | |||
| 32 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 33 | static int dbg_check_bud_bytes(struct ubifs_info *c); | ||
| 34 | #else | ||
| 35 | #define dbg_check_bud_bytes(c) 0 | ||
| 36 | #endif | ||
| 37 | |||
| 38 | /** | ||
| 39 | * ubifs_search_bud - search bud LEB. | ||
| 40 | * @c: UBIFS file-system description object | ||
| 41 | * @lnum: logical eraseblock number to search | ||
| 42 | * | ||
| 43 | * This function searches bud LEB @lnum. Returns bud description object in case | ||
| 44 | * of success and %NULL if there is no bud with this LEB number. | ||
| 45 | */ | ||
| 46 | struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum) | ||
| 47 | { | ||
| 48 | struct rb_node *p; | ||
| 49 | struct ubifs_bud *bud; | ||
| 50 | |||
| 51 | spin_lock(&c->buds_lock); | ||
| 52 | p = c->buds.rb_node; | ||
| 53 | while (p) { | ||
| 54 | bud = rb_entry(p, struct ubifs_bud, rb); | ||
| 55 | if (lnum < bud->lnum) | ||
| 56 | p = p->rb_left; | ||
| 57 | else if (lnum > bud->lnum) | ||
| 58 | p = p->rb_right; | ||
| 59 | else { | ||
| 60 | spin_unlock(&c->buds_lock); | ||
| 61 | return bud; | ||
| 62 | } | ||
| 63 | } | ||
| 64 | spin_unlock(&c->buds_lock); | ||
| 65 | return NULL; | ||
| 66 | } | ||
| 67 | |||
| 68 | /** | ||
| 69 | * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one. | ||
| 70 | * @c: UBIFS file-system description object | ||
| 71 | * @lnum: logical eraseblock number to search | ||
| 72 | * | ||
| 73 | * This functions returns the wbuf for @lnum or %NULL if there is not one. | ||
| 74 | */ | ||
| 75 | struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) | ||
| 76 | { | ||
| 77 | struct rb_node *p; | ||
| 78 | struct ubifs_bud *bud; | ||
| 79 | int jhead; | ||
| 80 | |||
| 81 | if (!c->jheads) | ||
| 82 | return NULL; | ||
| 83 | |||
| 84 | spin_lock(&c->buds_lock); | ||
| 85 | p = c->buds.rb_node; | ||
| 86 | while (p) { | ||
| 87 | bud = rb_entry(p, struct ubifs_bud, rb); | ||
| 88 | if (lnum < bud->lnum) | ||
| 89 | p = p->rb_left; | ||
| 90 | else if (lnum > bud->lnum) | ||
| 91 | p = p->rb_right; | ||
| 92 | else { | ||
| 93 | jhead = bud->jhead; | ||
| 94 | spin_unlock(&c->buds_lock); | ||
| 95 | return &c->jheads[jhead].wbuf; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | spin_unlock(&c->buds_lock); | ||
| 99 | return NULL; | ||
| 100 | } | ||
| 101 | |||
| 102 | /** | ||
| 103 | * next_log_lnum - switch to the next log LEB. | ||
| 104 | * @c: UBIFS file-system description object | ||
| 105 | * @lnum: current log LEB | ||
| 106 | */ | ||
| 107 | static inline int next_log_lnum(const struct ubifs_info *c, int lnum) | ||
| 108 | { | ||
| 109 | lnum += 1; | ||
| 110 | if (lnum > c->log_last) | ||
| 111 | lnum = UBIFS_LOG_LNUM; | ||
| 112 | |||
| 113 | return lnum; | ||
| 114 | } | ||
| 115 | |||
| 116 | /** | ||
| 117 | * empty_log_bytes - calculate amount of empty space in the log. | ||
| 118 | * @c: UBIFS file-system description object | ||
| 119 | */ | ||
| 120 | static inline long long empty_log_bytes(const struct ubifs_info *c) | ||
| 121 | { | ||
| 122 | long long h, t; | ||
| 123 | |||
| 124 | h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; | ||
| 125 | t = (long long)c->ltail_lnum * c->leb_size; | ||
| 126 | |||
| 127 | if (h >= t) | ||
| 128 | return c->log_bytes - h + t; | ||
| 129 | else | ||
| 130 | return t - h; | ||
| 131 | } | ||
| 132 | |||
/**
 * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list.
 * @c: UBIFS file-system description object
 * @bud: the bud to add
 *
 * The caller is expected to have filled in @bud (LEB number, start offset and
 * journal head number). This function inserts it into the @c->buds RB-tree,
 * which is keyed by LEB number, and appends it to the corresponding journal
 * head's list of buds. It also accounts the bud's space in @c->bud_bytes.
 */
void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
{
	struct rb_node **p, *parent = NULL;
	struct ubifs_bud *b;
	struct ubifs_jhead *jhead;

	spin_lock(&c->buds_lock);
	/* Find the insertion point - LEB numbers are unique in this tree */
	p = &c->buds.rb_node;
	while (*p) {
		parent = *p;
		b = rb_entry(parent, struct ubifs_bud, rb);
		ubifs_assert(bud->lnum != b->lnum);
		if (bud->lnum < b->lnum)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}

	rb_link_node(&bud->rb, parent, p);
	rb_insert_color(&bud->rb, &c->buds);
	if (c->jheads) {
		jhead = &c->jheads[bud->jhead];
		list_add_tail(&bud->list, &jhead->buds_list);
	} else
		/*
		 * Journal heads may legitimately be absent only while
		 * replaying the journal during a read-only mount.
		 */
		ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));

	/*
	 * Note, although this is a new bud, we anyway account this space now,
	 * before any data has been written to it, because this is about to
	 * guarantee fixed mount time, and this bud will anyway be read and
	 * scanned.
	 */
	c->bud_bytes += c->leb_size - bud->start;

	dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
		bud->start, bud->jhead, c->bud_bytes);
	spin_unlock(&c->buds_lock);
}
| 176 | |||
| 177 | /** | ||
| 178 | * ubifs_create_buds_lists - create journal head buds lists for remount rw. | ||
| 179 | * @c: UBIFS file-system description object | ||
| 180 | */ | ||
| 181 | void ubifs_create_buds_lists(struct ubifs_info *c) | ||
| 182 | { | ||
| 183 | struct rb_node *p; | ||
| 184 | |||
| 185 | spin_lock(&c->buds_lock); | ||
| 186 | p = rb_first(&c->buds); | ||
| 187 | while (p) { | ||
| 188 | struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb); | ||
| 189 | struct ubifs_jhead *jhead = &c->jheads[bud->jhead]; | ||
| 190 | |||
| 191 | list_add_tail(&bud->list, &jhead->buds_list); | ||
| 192 | p = rb_next(p); | ||
| 193 | } | ||
| 194 | spin_unlock(&c->buds_lock); | ||
| 195 | } | ||
| 196 | |||
/**
 * ubifs_add_bud_to_log - add a new bud to the log.
 * @c: UBIFS file-system description object
 * @jhead: journal head the bud belongs to
 * @lnum: LEB number of the bud
 * @offs: starting offset of the bud
 *
 * This function writes a reference node for the new bud LEB @lnum to the log,
 * and adds it to the buds tree. It also makes sure that log size does not
 * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success,
 * %-EAGAIN if commit is required, and a negative error code in case of
 * failure.
 */
int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
{
	int err;
	struct ubifs_bud *bud;
	struct ubifs_ref_node *ref;

	/* Allocate before taking @c->log_mutex to keep the hold time short */
	bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
	if (!bud)
		return -ENOMEM;
	ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
	if (!ref) {
		kfree(bud);
		return -ENOMEM;
	}

	mutex_lock(&c->log_mutex);

	if (c->ro_media) {
		err = -EROFS;
		goto out_unlock;
	}

	/* Make sure we have enough space in the log */
	if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
		dbg_log("not enough log space - %lld, required %d",
			empty_log_bytes(c), c->min_log_bytes);
		ubifs_commit_required(c);
		err = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * Make sure the amount of space in buds will not exceed the
	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
	 * limits.
	 *
	 * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
	 * because we are holding @c->log_mutex. All @c->bud_bytes updates take
	 * place when both @c->log_mutex and @c->buds_lock are locked.
	 */
	if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
		dbg_log("bud bytes %lld (%lld max), require commit",
			c->bud_bytes, c->max_bud_bytes);
		ubifs_commit_required(c);
		err = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If the journal is full enough - start background commit. Note, it is
	 * OK to read 'c->cmt_state' without spinlock because integer reads
	 * are atomic in the kernel.
	 */
	if (c->bud_bytes >= c->bg_bud_bytes &&
	    c->cmt_state == COMMIT_RESTING) {
		dbg_log("bud bytes %lld (%lld max), initiate BG commit",
			c->bud_bytes, c->max_bud_bytes);
		ubifs_request_bg_commit(c);
	}

	bud->lnum = lnum;
	bud->start = offs;
	bud->jhead = jhead;

	ref->ch.node_type = UBIFS_REF_NODE;
	ref->lnum = cpu_to_le32(bud->lnum);
	ref->offs = cpu_to_le32(bud->start);
	ref->jhead = cpu_to_le32(jhead);

	/* The reference node does not fit - move the log head to a new LEB */
	if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
		c->lhead_offs = 0;
	}

	if (c->lhead_offs == 0) {
		/* Must ensure next log LEB has been unmapped */
		err = ubifs_leb_unmap(c, c->lhead_lnum);
		if (err)
			goto out_unlock;
	}

	if (bud->start == 0) {
		/*
		 * Before writing the LEB reference which refers an empty LEB
		 * to the log, we have to make sure it is mapped, because
		 * otherwise we'd risk to refer an LEB with garbage in case of
		 * an unclean reboot, because the target LEB might have been
		 * unmapped, but not yet physically erased.
		 */
		err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
		if (err)
			goto out_unlock;
	}

	dbg_log("write ref LEB %d:%d",
		c->lhead_lnum, c->lhead_offs);
	err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
			       c->lhead_offs, UBI_SHORTTERM);
	if (err)
		goto out_unlock;

	c->lhead_offs += c->ref_node_alsz;

	ubifs_add_bud(c, bud);

	mutex_unlock(&c->log_mutex);
	kfree(ref);
	return 0;

out_unlock:
	mutex_unlock(&c->log_mutex);
	kfree(ref);
	kfree(bud);
	return err;
}
| 325 | |||
/**
 * remove_buds - remove used buds.
 * @c: UBIFS file-system description object
 *
 * This function removes used buds from the buds tree. It does not remove the
 * buds which are pointed to by journal heads. It also accumulates, in
 * @c->cmt_bud_bytes, the amount of bud space which this commit makes
 * reclaimable.
 */
static void remove_buds(struct ubifs_info *c)
{
	struct rb_node *p;

	ubifs_assert(list_empty(&c->old_buds));
	c->cmt_bud_bytes = 0;
	spin_lock(&c->buds_lock);
	p = rb_first(&c->buds);
	while (p) {
		/* Grab the next node first - @p may be erased below */
		struct rb_node *p1 = p;
		struct ubifs_bud *bud;
		struct ubifs_wbuf *wbuf;

		p = rb_next(p);
		bud = rb_entry(p1, struct ubifs_bud, rb);
		wbuf = &c->jheads[bud->jhead].wbuf;

		if (wbuf->lnum == bud->lnum) {
			/*
			 * Do not remove buds which are pointed to by journal
			 * heads (non-closed buds).
			 */
			c->cmt_bud_bytes += wbuf->offs - bud->start;
			dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
				"cmt_bud_bytes %lld", bud->lnum, bud->start,
				bud->jhead, wbuf->offs - bud->start,
				c->cmt_bud_bytes);
			bud->start = wbuf->offs;
		} else {
			c->cmt_bud_bytes += c->leb_size - bud->start;
			dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
				"cmt_bud_bytes %lld", bud->lnum, bud->start,
				bud->jhead, c->leb_size - bud->start,
				c->cmt_bud_bytes);
			rb_erase(p1, &c->buds);
			list_del(&bud->list);
			/*
			 * If the commit does not finish, the recovery will need
			 * to replay the journal, in which case the old buds
			 * must be unchanged. Do not release them until post
			 * commit i.e. do not allow them to be garbage
			 * collected.
			 */
			list_add(&bud->list, &c->old_buds);
		}
	}
	spin_unlock(&c->buds_lock);
}
| 381 | |||
/**
 * ubifs_log_start_commit - start commit.
 * @c: UBIFS file-system description object
 * @ltail_lnum: return new log tail LEB number
 *
 * The commit operation starts with writing "commit start" node to the log and
 * reference nodes for all journal heads which will define new journal after
 * the commit has been finished. The commit start and reference nodes are
 * written in one go to the nearest empty log LEB (hence, when commit is
 * finished UBIFS may safely unmap all the previous log LEBs). This function
 * returns zero in case of success and a negative error code in case of
 * failure.
 */
int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
{
	void *buf;
	struct ubifs_cs_node *cs;
	struct ubifs_ref_node *ref;
	int err, i, max_len, len;

	err = dbg_check_bud_bytes(c);
	if (err)
		return err;

	/* Worst case: a CS node plus one reference node per journal head */
	max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ;
	max_len = ALIGN(max_len, c->min_io_size);
	buf = cs = kmalloc(max_len, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	cs->ch.node_type = UBIFS_CS_NODE;
	cs->cmt_no = cpu_to_le64(c->cmt_no + 1);
	ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);

	/*
	 * Note, we do not lock 'c->log_mutex' because this is the commit start
	 * phase and we are exclusively using the log. And we do not lock
	 * write-buffer because nobody can write to the file-system at this
	 * phase.
	 */

	len = UBIFS_CS_NODE_SZ;
	for (i = 0; i < c->jhead_cnt; i++) {
		int lnum = c->jheads[i].wbuf.lnum;
		int offs = c->jheads[i].wbuf.offs;

		/* Skip journal heads which have no bud at the moment */
		if (lnum == -1 || offs == c->leb_size)
			continue;

		dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
		ref = buf + len;
		ref->ch.node_type = UBIFS_REF_NODE;
		ref->lnum = cpu_to_le32(lnum);
		ref->offs = cpu_to_le32(offs);
		ref->jhead = cpu_to_le32(i);

		ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
		len += UBIFS_REF_NODE_SZ;
	}

	/* Pad up to the minimal I/O unit boundary */
	ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);

	/* Switch to the next log LEB */
	if (c->lhead_offs) {
		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
		c->lhead_offs = 0;
	}

	if (c->lhead_offs == 0) {
		/* Must ensure next LEB has been unmapped */
		err = ubifs_leb_unmap(c, c->lhead_lnum);
		if (err)
			goto out;
	}

	len = ALIGN(len, c->min_io_size);
	dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
	err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
	if (err)
		goto out;

	/* The new log tail is the LEB holding the commit start node */
	*ltail_lnum = c->lhead_lnum;

	c->lhead_offs += len;
	if (c->lhead_offs == c->leb_size) {
		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
		c->lhead_offs = 0;
	}

	remove_buds(c);

	/*
	 * We have started the commit and now users may use the rest of the log
	 * for new writes.
	 */
	c->min_log_bytes = 0;

out:
	kfree(buf);
	return err;
}
| 483 | |||
/**
 * ubifs_log_end_commit - end commit.
 * @c: UBIFS file-system description object
 * @ltail_lnum: new log tail LEB number
 *
 * This function is called when the commit operation has finished. It
 * moves log tail to new position and unmaps LEBs which contain obsolete data.
 * Returns zero in case of success and a negative error code in case of
 * failure.
 */
int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
{
	int err;

	/*
	 * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
	 * writes during commit. It is only during the short commit start phase
	 * that writers are blocked.
	 */
	mutex_lock(&c->log_mutex);

	dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
		c->ltail_lnum, ltail_lnum);

	c->ltail_lnum = ltail_lnum;
	/*
	 * The commit is finished and from now on it must be guaranteed that
	 * there is always enough space for the next commit.
	 */
	c->min_log_bytes = c->leb_size;

	/* The space of the buds committed by this commit is now reclaimed */
	spin_lock(&c->buds_lock);
	c->bud_bytes -= c->cmt_bud_bytes;
	spin_unlock(&c->buds_lock);

	err = dbg_check_bud_bytes(c);

	mutex_unlock(&c->log_mutex);
	return err;
}
| 524 | |||
/**
 * ubifs_log_post_commit - things to do after commit is completed.
 * @c: UBIFS file-system description object
 * @old_ltail_lnum: old log tail LEB number
 *
 * Release buds only after commit is completed, because they must be unchanged
 * if recovery is needed.
 *
 * Unmap log LEBs only after commit is completed, because they may be needed for
 * recovery.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
{
	int lnum, err = 0;

	/* Return the old buds (saved by remove_buds()) to the LEB pool */
	while (!list_empty(&c->old_buds)) {
		struct ubifs_bud *bud;

		bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
		err = ubifs_return_leb(c, bud->lnum);
		if (err)
			return err;
		list_del(&bud->list);
		kfree(bud);
	}
	mutex_lock(&c->log_mutex);
	/* Unmap the log LEBs between the old and the new tail */
	for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
	     lnum = next_log_lnum(c, lnum)) {
		dbg_log("unmap log LEB %d", lnum);
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			goto out;
	}
out:
	mutex_unlock(&c->log_mutex);
	return err;
}
| 564 | |||
/**
 * struct done_ref - references that have been done.
 * @rb: rb-tree node
 * @lnum: LEB number
 *
 * Used by log consolidation to remember which LEBs already have a reference
 * node written, so duplicates can be omitted.
 */
struct done_ref {
	struct rb_node rb;
	int lnum;
};
| 574 | |||
| 575 | /** | ||
| 576 | * done_already - determine if a reference has been done already. | ||
| 577 | * @done_tree: rb-tree to store references that have been done | ||
| 578 | * @lnum: LEB number of reference | ||
| 579 | * | ||
| 580 | * This function returns %1 if the reference has been done, %0 if not, otherwise | ||
| 581 | * a negative error code is returned. | ||
| 582 | */ | ||
| 583 | static int done_already(struct rb_root *done_tree, int lnum) | ||
| 584 | { | ||
| 585 | struct rb_node **p = &done_tree->rb_node, *parent = NULL; | ||
| 586 | struct done_ref *dr; | ||
| 587 | |||
| 588 | while (*p) { | ||
| 589 | parent = *p; | ||
| 590 | dr = rb_entry(parent, struct done_ref, rb); | ||
| 591 | if (lnum < dr->lnum) | ||
| 592 | p = &(*p)->rb_left; | ||
| 593 | else if (lnum > dr->lnum) | ||
| 594 | p = &(*p)->rb_right; | ||
| 595 | else | ||
| 596 | return 1; | ||
| 597 | } | ||
| 598 | |||
| 599 | dr = kzalloc(sizeof(struct done_ref), GFP_NOFS); | ||
| 600 | if (!dr) | ||
| 601 | return -ENOMEM; | ||
| 602 | |||
| 603 | dr->lnum = lnum; | ||
| 604 | |||
| 605 | rb_link_node(&dr->rb, parent, p); | ||
| 606 | rb_insert_color(&dr->rb, done_tree); | ||
| 607 | |||
| 608 | return 0; | ||
| 609 | } | ||
| 610 | |||
/**
 * destroy_done_tree - destroy the done tree.
 * @done_tree: done tree to destroy
 *
 * Frees every node using a non-recursive post-order walk: descend to a leaf,
 * free it, and clear the parent's link to it so the parent itself eventually
 * becomes a leaf.
 */
static void destroy_done_tree(struct rb_root *done_tree)
{
	struct rb_node *this = done_tree->rb_node;
	struct done_ref *dr;

	while (this) {
		/* Descend as far as possible before freeing anything */
		if (this->rb_left) {
			this = this->rb_left;
			continue;
		} else if (this->rb_right) {
			this = this->rb_right;
			continue;
		}
		dr = rb_entry(this, struct done_ref, rb);
		this = rb_parent(this);
		if (this) {
			/* Detach the freed child so it is not visited again */
			if (this->rb_left == &dr->rb)
				this->rb_left = NULL;
			else
				this->rb_right = NULL;
		}
		kfree(dr);
	}
}
| 639 | |||
/**
 * add_node - add a node to the consolidated log.
 * @c: UBIFS file-system description object
 * @buf: buffer to which to add (must be @c->leb_size bytes)
 * @lnum: LEB number to which to write is passed and returned here
 * @offs: offset to where to write is passed and returned here
 * @node: node to add
 *
 * If the node does not fit in the current LEB, the buffered data is padded,
 * flushed to @lnum, and accumulation continues at the start of the next log
 * LEB. This function returns %0 on success and a negative error code on
 * failure.
 */
static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
		    void *node)
{
	struct ubifs_ch *ch = node;
	int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs;

	if (len > remains) {
		/* Flush the full buffer and move on to the next log LEB */
		int sz = ALIGN(*offs, c->min_io_size), err;

		ubifs_pad(c, buf + *offs, sz - *offs);
		err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
		if (err)
			return err;
		*lnum = next_log_lnum(c, *lnum);
		*offs = 0;
	}
	memcpy(buf + *offs, node, len);
	/* Nodes are kept 8-byte aligned within a LEB */
	*offs += ALIGN(len, 8);
	return 0;
}
| 670 | |||
/**
 * ubifs_consolidate_log - consolidate the log.
 * @c: UBIFS file-system description object
 *
 * Repeated failed commits could cause the log to be full, but at least 1 LEB is
 * needed for commit. This function rewrites the reference nodes in the log
 * omitting duplicates, and failed CS nodes, and leaving no gaps.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_consolidate_log(struct ubifs_info *c)
{
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	struct rb_root done_tree = RB_ROOT;
	int lnum, err, first = 1, write_lnum, offs = 0;
	void *buf;

	dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
		  c->lhead_lnum);
	buf = vmalloc(c->leb_size);
	if (!buf)
		return -ENOMEM;
	/* Scan from the tail, rewriting in place starting at the tail LEB */
	lnum = c->ltail_lnum;
	write_lnum = lnum;
	while (1) {
		sleb = ubifs_scan(c, lnum, 0, c->sbuf);
		if (IS_ERR(sleb)) {
			err = PTR_ERR(sleb);
			goto out_free;
		}
		list_for_each_entry(snod, &sleb->nodes, list) {
			switch (snod->type) {
			case UBIFS_REF_NODE: {
				struct ubifs_ref_node *ref = snod->node;
				int ref_lnum = le32_to_cpu(ref->lnum);

				/*
				 * Keep only the first reference to each bud
				 * LEB; done_already() returns 1 on duplicates.
				 */
				err = done_already(&done_tree, ref_lnum);
				if (err < 0)
					goto out_scan;
				if (err != 1) {
					err = add_node(c, buf, &write_lnum,
						       &offs, snod->node);
					if (err)
						goto out_scan;
				}
				break;
			}
			case UBIFS_CS_NODE:
				/* Keep only the first (valid) CS node */
				if (!first)
					break;
				err = add_node(c, buf, &write_lnum, &offs,
					       snod->node);
				if (err)
					goto out_scan;
				first = 0;
				break;
			}
		}
		ubifs_scan_destroy(sleb);
		if (lnum == c->lhead_lnum)
			break;
		lnum = next_log_lnum(c, lnum);
	}
	if (offs) {
		/* Flush the last partially-filled buffer */
		int sz = ALIGN(offs, c->min_io_size);

		ubifs_pad(c, buf + offs, sz - offs);
		err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
		if (err)
			goto out_free;
		offs = ALIGN(offs, c->min_io_size);
	}
	destroy_done_tree(&done_tree);
	vfree(buf);
	if (write_lnum == c->lhead_lnum) {
		/* Consolidation did not free even one LEB */
		ubifs_err("log is too full");
		return -EINVAL;
	}
	/* Unmap remaining LEBs */
	lnum = write_lnum;
	do {
		lnum = next_log_lnum(c, lnum);
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			return err;
	} while (lnum != c->lhead_lnum);
	c->lhead_lnum = write_lnum;
	c->lhead_offs = offs;
	dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
	return 0;

out_scan:
	ubifs_scan_destroy(sleb);
out_free:
	destroy_done_tree(&done_tree);
	vfree(buf);
	return err;
}
| 770 | |||
| 771 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 772 | |||
| 773 | /** | ||
| 774 | * dbg_check_bud_bytes - make sure bud bytes calculation are all right. | ||
| 775 | * @c: UBIFS file-system description object | ||
| 776 | * | ||
| 777 | * This function makes sure the amount of flash space used by closed buds | ||
| 778 | * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in | ||
| 779 | * case of failure. | ||
| 780 | */ | ||
| 781 | static int dbg_check_bud_bytes(struct ubifs_info *c) | ||
| 782 | { | ||
| 783 | int i, err = 0; | ||
| 784 | struct ubifs_bud *bud; | ||
| 785 | long long bud_bytes = 0; | ||
| 786 | |||
| 787 | if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) | ||
| 788 | return 0; | ||
| 789 | |||
| 790 | spin_lock(&c->buds_lock); | ||
| 791 | for (i = 0; i < c->jhead_cnt; i++) | ||
| 792 | list_for_each_entry(bud, &c->jheads[i].buds_list, list) | ||
| 793 | bud_bytes += c->leb_size - bud->start; | ||
| 794 | |||
| 795 | if (c->bud_bytes != bud_bytes) { | ||
| 796 | ubifs_err("bad bud_bytes %lld, calculated %lld", | ||
| 797 | c->bud_bytes, bud_bytes); | ||
| 798 | err = -EINVAL; | ||
| 799 | } | ||
| 800 | spin_unlock(&c->buds_lock); | ||
| 801 | |||
| 802 | return err; | ||
| 803 | } | ||
| 804 | |||
| 805 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c new file mode 100644 index 000000000000..2ba93da71b65 --- /dev/null +++ b/fs/ubifs/lprops.c | |||
| @@ -0,0 +1,1357 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements the functions that access LEB properties and their | ||
| 25 | * categories. LEBs are categorized based on the needs of UBIFS, and the | ||
| 26 | * categories are stored as either heaps or lists to provide a fast way of | ||
| 27 | * finding a LEB in a particular category. For example, UBIFS may need to find | ||
| 28 | * an empty LEB for the journal, or a very dirty LEB for garbage collection. | ||
| 29 | */ | ||
| 30 | |||
| 31 | #include "ubifs.h" | ||
| 32 | |||
| 33 | /** | ||
| 34 | * get_heap_comp_val - get the LEB properties value for heap comparisons. | ||
| 35 | * @lprops: LEB properties | ||
| 36 | * @cat: LEB category | ||
| 37 | */ | ||
| 38 | static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat) | ||
| 39 | { | ||
| 40 | switch (cat) { | ||
| 41 | case LPROPS_FREE: | ||
| 42 | return lprops->free; | ||
| 43 | case LPROPS_DIRTY_IDX: | ||
| 44 | return lprops->free + lprops->dirty; | ||
| 45 | default: | ||
| 46 | return lprops->dirty; | ||
| 47 | } | ||
| 48 | } | ||
| 49 | |||
/**
 * move_up_lpt_heap - move a new heap entry up as far as possible.
 * @c: UBIFS file-system description object
 * @heap: LEB category heap
 * @lprops: LEB properties to move
 * @cat: LEB category
 *
 * New entries to a heap are added at the bottom and then moved up until the
 * parent's value is greater. In the case of LPT's category heaps, the value
 * is either the amount of free space or the amount of dirty space, depending
 * on the category.
 */
static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
			     struct ubifs_lprops *lprops, int cat)
{
	int val1, val2, hpos;

	hpos = lprops->hpos;
	if (!hpos)
		return; /* Already top of the heap */
	val1 = get_heap_comp_val(lprops, cat);
	/* Compare to parent and, if greater, move up the heap */
	do {
		/* Parent of element at hpos lives at (hpos - 1) / 2 */
		int ppos = (hpos - 1) / 2;

		val2 = get_heap_comp_val(heap->arr[ppos], cat);
		if (val2 >= val1)
			return;
		/* Greater than parent so move up: swap with the parent */
		heap->arr[ppos]->hpos = hpos;
		heap->arr[hpos] = heap->arr[ppos];
		heap->arr[ppos] = lprops;
		lprops->hpos = ppos;
		hpos = ppos;
	} while (hpos);
}
| 86 | |||
/**
 * adjust_lpt_heap - move a changed heap entry up or down the heap.
 * @c: UBIFS file-system description object
 * @heap: LEB category heap
 * @lprops: LEB properties to move
 * @hpos: heap position of @lprops
 * @cat: LEB category
 *
 * Changed entries in a heap are moved up or down until the parent's value is
 * greater. In the case of LPT's category heaps, the value is either the amount
 * of free space or the amount of dirty space, depending on the category.
 */
static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
			    struct ubifs_lprops *lprops, int hpos, int cat)
{
	int val1, val2, val3, cpos;

	val1 = get_heap_comp_val(lprops, cat);
	/* Compare to parent and, if greater than parent, move up the heap */
	if (hpos) {
		int ppos = (hpos - 1) / 2;

		val2 = get_heap_comp_val(heap->arr[ppos], cat);
		if (val1 > val2) {
			/* Greater than parent so move up */
			while (1) {
				heap->arr[ppos]->hpos = hpos;
				heap->arr[hpos] = heap->arr[ppos];
				heap->arr[ppos] = lprops;
				lprops->hpos = ppos;
				hpos = ppos;
				if (!hpos)
					return;
				ppos = (hpos - 1) / 2;
				val2 = get_heap_comp_val(heap->arr[ppos], cat);
				if (val1 <= val2)
					return;
				/* Still greater than parent so keep going */
			}
		}
	}
	/* Not greater than parent, so compare to children */
	while (1) {
		/* Compare to left child (at hpos * 2 + 1) */
		cpos = hpos * 2 + 1;
		if (cpos >= heap->cnt)
			return;
		val2 = get_heap_comp_val(heap->arr[cpos], cat);
		if (val1 < val2) {
			/* Less than left child, so promote biggest child */
			if (cpos + 1 < heap->cnt) {
				val3 = get_heap_comp_val(heap->arr[cpos + 1],
							 cat);
				if (val3 > val2)
					cpos += 1; /* Right child is bigger */
			}
			heap->arr[cpos]->hpos = hpos;
			heap->arr[hpos] = heap->arr[cpos];
			heap->arr[cpos] = lprops;
			lprops->hpos = cpos;
			hpos = cpos;
			continue;
		}
		/* Compare to right child */
		cpos += 1;
		if (cpos >= heap->cnt)
			return;
		val3 = get_heap_comp_val(heap->arr[cpos], cat);
		if (val1 < val3) {
			/* Less than right child, so promote right child */
			heap->arr[cpos]->hpos = hpos;
			heap->arr[hpos] = heap->arr[cpos];
			heap->arr[cpos] = lprops;
			lprops->hpos = cpos;
			hpos = cpos;
			continue;
		}
		return;
	}
}
| 167 | |||
| 168 | /** | ||
| 169 | * add_to_lpt_heap - add LEB properties to a LEB category heap. | ||
| 170 | * @c: UBIFS file-system description object | ||
| 171 | * @lprops: LEB properties to add | ||
| 172 | * @cat: LEB category | ||
| 173 | * | ||
| 174 | * This function returns %1 if @lprops is added to the heap for LEB category | ||
| 175 | * @cat, otherwise %0 is returned because the heap is full. | ||
| 176 | */ | ||
| 177 | static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, | ||
| 178 | int cat) | ||
| 179 | { | ||
| 180 | struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; | ||
| 181 | |||
| 182 | if (heap->cnt >= heap->max_cnt) { | ||
| 183 | const int b = LPT_HEAP_SZ / 2 - 1; | ||
| 184 | int cpos, val1, val2; | ||
| 185 | |||
| 186 | /* Compare to some other LEB on the bottom of heap */ | ||
| 187 | /* Pick a position kind of randomly */ | ||
| 188 | cpos = (((size_t)lprops >> 4) & b) + b; | ||
| 189 | ubifs_assert(cpos >= b); | ||
| 190 | ubifs_assert(cpos < LPT_HEAP_SZ); | ||
| 191 | ubifs_assert(cpos < heap->cnt); | ||
| 192 | |||
| 193 | val1 = get_heap_comp_val(lprops, cat); | ||
| 194 | val2 = get_heap_comp_val(heap->arr[cpos], cat); | ||
| 195 | if (val1 > val2) { | ||
| 196 | struct ubifs_lprops *lp; | ||
| 197 | |||
| 198 | lp = heap->arr[cpos]; | ||
| 199 | lp->flags &= ~LPROPS_CAT_MASK; | ||
| 200 | lp->flags |= LPROPS_UNCAT; | ||
| 201 | list_add(&lp->list, &c->uncat_list); | ||
| 202 | lprops->hpos = cpos; | ||
| 203 | heap->arr[cpos] = lprops; | ||
| 204 | move_up_lpt_heap(c, heap, lprops, cat); | ||
| 205 | dbg_check_heap(c, heap, cat, lprops->hpos); | ||
| 206 | return 1; /* Added to heap */ | ||
| 207 | } | ||
| 208 | dbg_check_heap(c, heap, cat, -1); | ||
| 209 | return 0; /* Not added to heap */ | ||
| 210 | } else { | ||
| 211 | lprops->hpos = heap->cnt++; | ||
| 212 | heap->arr[lprops->hpos] = lprops; | ||
| 213 | move_up_lpt_heap(c, heap, lprops, cat); | ||
| 214 | dbg_check_heap(c, heap, cat, lprops->hpos); | ||
| 215 | return 1; /* Added to heap */ | ||
| 216 | } | ||
| 217 | } | ||
| 218 | |||
| 219 | /** | ||
| 220 | * remove_from_lpt_heap - remove LEB properties from a LEB category heap. | ||
| 221 | * @c: UBIFS file-system description object | ||
| 222 | * @lprops: LEB properties to remove | ||
| 223 | * @cat: LEB category | ||
| 224 | */ | ||
| 225 | static void remove_from_lpt_heap(struct ubifs_info *c, | ||
| 226 | struct ubifs_lprops *lprops, int cat) | ||
| 227 | { | ||
| 228 | struct ubifs_lpt_heap *heap; | ||
| 229 | int hpos = lprops->hpos; | ||
| 230 | |||
| 231 | heap = &c->lpt_heap[cat - 1]; | ||
| 232 | ubifs_assert(hpos >= 0 && hpos < heap->cnt); | ||
| 233 | ubifs_assert(heap->arr[hpos] == lprops); | ||
| 234 | heap->cnt -= 1; | ||
| 235 | if (hpos < heap->cnt) { | ||
| 236 | heap->arr[hpos] = heap->arr[heap->cnt]; | ||
| 237 | heap->arr[hpos]->hpos = hpos; | ||
| 238 | adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); | ||
| 239 | } | ||
| 240 | dbg_check_heap(c, heap, cat, -1); | ||
| 241 | } | ||
| 242 | |||
| 243 | /** | ||
| 244 | * lpt_heap_replace - replace lprops in a category heap. | ||
| 245 | * @c: UBIFS file-system description object | ||
| 246 | * @old_lprops: LEB properties to replace | ||
| 247 | * @new_lprops: LEB properties with which to replace | ||
| 248 | * @cat: LEB category | ||
| 249 | * | ||
| 250 | * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) | ||
| 251 | * and the lprops that the pnode contains. When that happens, references in | ||
| 252 | * the category heaps to those lprops must be updated to point to the new | ||
| 253 | * lprops. This function does that. | ||
| 254 | */ | ||
| 255 | static void lpt_heap_replace(struct ubifs_info *c, | ||
| 256 | struct ubifs_lprops *old_lprops, | ||
| 257 | struct ubifs_lprops *new_lprops, int cat) | ||
| 258 | { | ||
| 259 | struct ubifs_lpt_heap *heap; | ||
| 260 | int hpos = new_lprops->hpos; | ||
| 261 | |||
| 262 | heap = &c->lpt_heap[cat - 1]; | ||
| 263 | heap->arr[hpos] = new_lprops; | ||
| 264 | } | ||
| 265 | |||
| 266 | /** | ||
| 267 | * ubifs_add_to_cat - add LEB properties to a category list or heap. | ||
| 268 | * @c: UBIFS file-system description object | ||
| 269 | * @lprops: LEB properties to add | ||
| 270 | * @cat: LEB category to which to add | ||
| 271 | * | ||
| 272 | * LEB properties are categorized to enable fast find operations. | ||
| 273 | */ | ||
| 274 | void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, | ||
| 275 | int cat) | ||
| 276 | { | ||
| 277 | switch (cat) { | ||
| 278 | case LPROPS_DIRTY: | ||
| 279 | case LPROPS_DIRTY_IDX: | ||
| 280 | case LPROPS_FREE: | ||
| 281 | if (add_to_lpt_heap(c, lprops, cat)) | ||
| 282 | break; | ||
| 283 | /* No more room on heap so make it uncategorized */ | ||
| 284 | cat = LPROPS_UNCAT; | ||
| 285 | /* Fall through */ | ||
| 286 | case LPROPS_UNCAT: | ||
| 287 | list_add(&lprops->list, &c->uncat_list); | ||
| 288 | break; | ||
| 289 | case LPROPS_EMPTY: | ||
| 290 | list_add(&lprops->list, &c->empty_list); | ||
| 291 | break; | ||
| 292 | case LPROPS_FREEABLE: | ||
| 293 | list_add(&lprops->list, &c->freeable_list); | ||
| 294 | c->freeable_cnt += 1; | ||
| 295 | break; | ||
| 296 | case LPROPS_FRDI_IDX: | ||
| 297 | list_add(&lprops->list, &c->frdi_idx_list); | ||
| 298 | break; | ||
| 299 | default: | ||
| 300 | ubifs_assert(0); | ||
| 301 | } | ||
| 302 | lprops->flags &= ~LPROPS_CAT_MASK; | ||
| 303 | lprops->flags |= cat; | ||
| 304 | } | ||
| 305 | |||
| 306 | /** | ||
| 307 | * ubifs_remove_from_cat - remove LEB properties from a category list or heap. | ||
| 308 | * @c: UBIFS file-system description object | ||
| 309 | * @lprops: LEB properties to remove | ||
| 310 | * @cat: LEB category from which to remove | ||
| 311 | * | ||
| 312 | * LEB properties are categorized to enable fast find operations. | ||
| 313 | */ | ||
| 314 | static void ubifs_remove_from_cat(struct ubifs_info *c, | ||
| 315 | struct ubifs_lprops *lprops, int cat) | ||
| 316 | { | ||
| 317 | switch (cat) { | ||
| 318 | case LPROPS_DIRTY: | ||
| 319 | case LPROPS_DIRTY_IDX: | ||
| 320 | case LPROPS_FREE: | ||
| 321 | remove_from_lpt_heap(c, lprops, cat); | ||
| 322 | break; | ||
| 323 | case LPROPS_FREEABLE: | ||
| 324 | c->freeable_cnt -= 1; | ||
| 325 | ubifs_assert(c->freeable_cnt >= 0); | ||
| 326 | /* Fall through */ | ||
| 327 | case LPROPS_UNCAT: | ||
| 328 | case LPROPS_EMPTY: | ||
| 329 | case LPROPS_FRDI_IDX: | ||
| 330 | ubifs_assert(!list_empty(&lprops->list)); | ||
| 331 | list_del(&lprops->list); | ||
| 332 | break; | ||
| 333 | default: | ||
| 334 | ubifs_assert(0); | ||
| 335 | } | ||
| 336 | } | ||
| 337 | |||
| 338 | /** | ||
| 339 | * ubifs_replace_cat - replace lprops in a category list or heap. | ||
| 340 | * @c: UBIFS file-system description object | ||
| 341 | * @old_lprops: LEB properties to replace | ||
| 342 | * @new_lprops: LEB properties with which to replace | ||
| 343 | * | ||
| 344 | * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) | ||
| 345 | * and the lprops that the pnode contains. When that happens, references in | ||
| 346 | * category lists and heaps must be replaced. This function does that. | ||
| 347 | */ | ||
| 348 | void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, | ||
| 349 | struct ubifs_lprops *new_lprops) | ||
| 350 | { | ||
| 351 | int cat; | ||
| 352 | |||
| 353 | cat = new_lprops->flags & LPROPS_CAT_MASK; | ||
| 354 | switch (cat) { | ||
| 355 | case LPROPS_DIRTY: | ||
| 356 | case LPROPS_DIRTY_IDX: | ||
| 357 | case LPROPS_FREE: | ||
| 358 | lpt_heap_replace(c, old_lprops, new_lprops, cat); | ||
| 359 | break; | ||
| 360 | case LPROPS_UNCAT: | ||
| 361 | case LPROPS_EMPTY: | ||
| 362 | case LPROPS_FREEABLE: | ||
| 363 | case LPROPS_FRDI_IDX: | ||
| 364 | list_replace(&old_lprops->list, &new_lprops->list); | ||
| 365 | break; | ||
| 366 | default: | ||
| 367 | ubifs_assert(0); | ||
| 368 | } | ||
| 369 | } | ||
| 370 | |||
| 371 | /** | ||
| 372 | * ubifs_ensure_cat - ensure LEB properties are categorized. | ||
| 373 | * @c: UBIFS file-system description object | ||
| 374 | * @lprops: LEB properties | ||
| 375 | * | ||
| 376 | * A LEB may have fallen off of the bottom of a heap, and ended up as | ||
| 377 | * uncategorized even though it has enough space for us now. If that is the case | ||
| 378 | * this function will put the LEB back onto a heap. | ||
| 379 | */ | ||
| 380 | void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) | ||
| 381 | { | ||
| 382 | int cat = lprops->flags & LPROPS_CAT_MASK; | ||
| 383 | |||
| 384 | if (cat != LPROPS_UNCAT) | ||
| 385 | return; | ||
| 386 | cat = ubifs_categorize_lprops(c, lprops); | ||
| 387 | if (cat == LPROPS_UNCAT) | ||
| 388 | return; | ||
| 389 | ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); | ||
| 390 | ubifs_add_to_cat(c, lprops, cat); | ||
| 391 | } | ||
| 392 | |||
| 393 | /** | ||
| 394 | * ubifs_categorize_lprops - categorize LEB properties. | ||
| 395 | * @c: UBIFS file-system description object | ||
| 396 | * @lprops: LEB properties to categorize | ||
| 397 | * | ||
| 398 | * LEB properties are categorized to enable fast find operations. This function | ||
| 399 | * returns the LEB category to which the LEB properties belong. Note however | ||
| 400 | * that if the LEB category is stored as a heap and the heap is full, the | ||
| 401 | * LEB properties may have their category changed to %LPROPS_UNCAT. | ||
| 402 | */ | ||
| 403 | int ubifs_categorize_lprops(const struct ubifs_info *c, | ||
| 404 | const struct ubifs_lprops *lprops) | ||
| 405 | { | ||
| 406 | if (lprops->flags & LPROPS_TAKEN) | ||
| 407 | return LPROPS_UNCAT; | ||
| 408 | |||
| 409 | if (lprops->free == c->leb_size) { | ||
| 410 | ubifs_assert(!(lprops->flags & LPROPS_INDEX)); | ||
| 411 | return LPROPS_EMPTY; | ||
| 412 | } | ||
| 413 | |||
| 414 | if (lprops->free + lprops->dirty == c->leb_size) { | ||
| 415 | if (lprops->flags & LPROPS_INDEX) | ||
| 416 | return LPROPS_FRDI_IDX; | ||
| 417 | else | ||
| 418 | return LPROPS_FREEABLE; | ||
| 419 | } | ||
| 420 | |||
| 421 | if (lprops->flags & LPROPS_INDEX) { | ||
| 422 | if (lprops->dirty + lprops->free >= c->min_idx_node_sz) | ||
| 423 | return LPROPS_DIRTY_IDX; | ||
| 424 | } else { | ||
| 425 | if (lprops->dirty >= c->dead_wm && | ||
| 426 | lprops->dirty > lprops->free) | ||
| 427 | return LPROPS_DIRTY; | ||
| 428 | if (lprops->free > 0) | ||
| 429 | return LPROPS_FREE; | ||
| 430 | } | ||
| 431 | |||
| 432 | return LPROPS_UNCAT; | ||
| 433 | } | ||
| 434 | |||
| 435 | /** | ||
| 436 | * change_category - change LEB properties category. | ||
| 437 | * @c: UBIFS file-system description object | ||
| 438 | * @lprops: LEB properties to recategorize | ||
| 439 | * | ||
| 440 | * LEB properties are categorized to enable fast find operations. When the LEB | ||
| 441 | * properties change they must be recategorized. | ||
| 442 | */ | ||
| 443 | static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) | ||
| 444 | { | ||
| 445 | int old_cat = lprops->flags & LPROPS_CAT_MASK; | ||
| 446 | int new_cat = ubifs_categorize_lprops(c, lprops); | ||
| 447 | |||
| 448 | if (old_cat == new_cat) { | ||
| 449 | struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; | ||
| 450 | |||
| 451 | /* lprops on a heap now must be moved up or down */ | ||
| 452 | if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) | ||
| 453 | return; /* Not on a heap */ | ||
| 454 | heap = &c->lpt_heap[new_cat - 1]; | ||
| 455 | adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat); | ||
| 456 | } else { | ||
| 457 | ubifs_remove_from_cat(c, lprops, old_cat); | ||
| 458 | ubifs_add_to_cat(c, lprops, new_cat); | ||
| 459 | } | ||
| 460 | } | ||
| 461 | |||
/**
 * ubifs_get_lprops - get reference to LEB properties.
 * @c: the UBIFS file-system description object
 *
 * This function locks lprops. Lprops have to be unlocked by
 * 'ubifs_release_lprops()'. Note this may sleep, so it must not be called
 * from atomic context.
 */
void ubifs_get_lprops(struct ubifs_info *c)
{
	mutex_lock(&c->lp_mutex);
}
| 473 | |||
| 474 | /** | ||
| 475 | * calc_dark - calculate LEB dark space size. | ||
| 476 | * @c: the UBIFS file-system description object | ||
| 477 | * @spc: amount of free and dirty space in the LEB | ||
| 478 | * | ||
| 479 | * This function calculates amount of dark space in an LEB which has @spc bytes | ||
| 480 | * of free and dirty space. Returns the calculations result. | ||
| 481 | * | ||
| 482 | * Dark space is the space which is not always usable - it depends on which | ||
| 483 | * nodes are written in which order. E.g., if an LEB has only 512 free bytes, | ||
| 484 | * it is dark space, because it cannot fit a large data node. So UBIFS cannot | ||
| 485 | * count on this LEB and treat these 512 bytes as usable because it is not true | ||
| 486 | * if, for example, only big chunks of uncompressible data will be written to | ||
| 487 | * the FS. | ||
| 488 | */ | ||
| 489 | static int calc_dark(struct ubifs_info *c, int spc) | ||
| 490 | { | ||
| 491 | ubifs_assert(!(spc & 7)); | ||
| 492 | |||
| 493 | if (spc < c->dark_wm) | ||
| 494 | return spc; | ||
| 495 | |||
| 496 | /* | ||
| 497 | * If we have slightly more space then the dark space watermark, we can | ||
| 498 | * anyway safely assume it we'll be able to write a node of the | ||
| 499 | * smallest size there. | ||
| 500 | */ | ||
| 501 | if (spc - c->dark_wm < MIN_WRITE_SZ) | ||
| 502 | return spc - MIN_WRITE_SZ; | ||
| 503 | |||
| 504 | return c->dark_wm; | ||
| 505 | } | ||
| 506 | |||
/**
 * is_lprops_dirty - determine if LEB properties are dirty.
 * @c: the UBIFS file-system description object
 * @lprops: LEB properties to test
 *
 * Returns %1 if the pnode containing @lprops is dirty (and not being
 * copied-on-write), i.e. the lprops may be modified in place; %0 otherwise.
 */
static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
	struct ubifs_pnode *pnode;
	int pos;

	/* Index of @lprops within its pnode's lprops[] array */
	pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
	/* Back up @pos entries to lprops[0], then recover the pnode */
	pnode = (struct ubifs_pnode *)container_of(lprops - pos,
						   struct ubifs_pnode,
						   lprops[0]);
	return !test_bit(COW_ZNODE, &pnode->flags) &&
	       test_bit(DIRTY_CNODE, &pnode->flags);
}
| 524 | |||
/**
 * ubifs_change_lp - change LEB properties.
 * @c: the UBIFS file-system description object
 * @lp: LEB properties to change
 * @free: new free space amount
 * @dirty: new dirty space amount
 * @flags: new flags
 * @idx_gc_cnt: change to the count of idx_gc list
 *
 * This function changes LEB properties. This function does not change a LEB
 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
 *
 * This function returns a pointer to the updated LEB properties on success
 * and a negative error code on failure. N.B. the LEB properties may have had to
 * be copied (due to COW) and consequently the pointer returned may not be the
 * same as the pointer passed.
 *
 * Must be called with @c->lp_mutex held.
 */
const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
					   const struct ubifs_lprops *lp,
					   int free, int dirty, int flags,
					   int idx_gc_cnt)
{
	/*
	 * This is the only function that is allowed to change lprops, so we
	 * discard the const qualifier.
	 */
	struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;

	dbg_lp("LEB %d, free %d, dirty %d, flags %d",
	       lprops->lnum, free, dirty, flags);

	ubifs_assert(mutex_is_locked(&c->lp_mutex));
	ubifs_assert(c->lst.empty_lebs >= 0 &&
		     c->lst.empty_lebs <= c->main_lebs);
	ubifs_assert(c->freeable_cnt >= 0);
	ubifs_assert(c->freeable_cnt <= c->main_lebs);
	ubifs_assert(c->lst.taken_empty_lebs >= 0);
	ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs);
	/* All space amounts are kept 8-byte aligned */
	ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7));
	ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7));
	ubifs_assert(!(c->lst.total_used & 7));
	ubifs_assert(free == LPROPS_NC || free >= 0);
	ubifs_assert(dirty == LPROPS_NC || dirty >= 0);

	/* COW the pnode if it is not already dirty (see is_lprops_dirty) */
	if (!is_lprops_dirty(c, lprops)) {
		lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum);
		if (IS_ERR(lprops))
			return lprops;
	} else
		ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum));

	ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));

	spin_lock(&c->space_lock);

	/* Retract the old contribution to taken_empty_lebs ... */
	if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
		c->lst.taken_empty_lebs -= 1;

	/* ... and to the dead/dark/used totals (non-index LEBs only) */
	if (!(lprops->flags & LPROPS_INDEX)) {
		int old_spc;

		old_spc = lprops->free + lprops->dirty;
		if (old_spc < c->dead_wm)
			c->lst.total_dead -= old_spc;
		else
			c->lst.total_dark -= calc_dark(c, old_spc);

		c->lst.total_used -= c->leb_size - old_spc;
	}

	if (free != LPROPS_NC) {
		free = ALIGN(free, 8);
		c->lst.total_free += free - lprops->free;

		/* Increase or decrease empty LEBs counter if needed */
		if (free == c->leb_size) {
			if (lprops->free != c->leb_size)
				c->lst.empty_lebs += 1;
		} else if (lprops->free == c->leb_size)
			c->lst.empty_lebs -= 1;
		lprops->free = free;
	}

	if (dirty != LPROPS_NC) {
		dirty = ALIGN(dirty, 8);
		c->lst.total_dirty += dirty - lprops->dirty;
		lprops->dirty = dirty;
	}

	if (flags != LPROPS_NC) {
		/* Take care about indexing LEBs counter if needed */
		if ((lprops->flags & LPROPS_INDEX)) {
			if (!(flags & LPROPS_INDEX))
				c->lst.idx_lebs -= 1;
		} else if (flags & LPROPS_INDEX)
			c->lst.idx_lebs += 1;
		lprops->flags = flags;
	}

	/* Re-add the contribution based on the new values */
	if (!(lprops->flags & LPROPS_INDEX)) {
		int new_spc;

		new_spc = lprops->free + lprops->dirty;
		if (new_spc < c->dead_wm)
			c->lst.total_dead += new_spc;
		else
			c->lst.total_dark += calc_dark(c, new_spc);

		c->lst.total_used += c->leb_size - new_spc;
	}

	if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
		c->lst.taken_empty_lebs += 1;

	/* New values may imply a new category (list/heap) */
	change_category(c, lprops);

	c->idx_gc_cnt += idx_gc_cnt;

	spin_unlock(&c->space_lock);

	return lprops;
}
| 647 | |||
| 648 | /** | ||
| 649 | * ubifs_release_lprops - release lprops lock. | ||
| 650 | * @c: the UBIFS file-system description object | ||
| 651 | * | ||
| 652 | * This function has to be called after each 'ubifs_get_lprops()' call to | ||
| 653 | * unlock lprops. | ||
| 654 | */ | ||
| 655 | void ubifs_release_lprops(struct ubifs_info *c) | ||
| 656 | { | ||
| 657 | ubifs_assert(mutex_is_locked(&c->lp_mutex)); | ||
| 658 | ubifs_assert(c->lst.empty_lebs >= 0 && | ||
| 659 | c->lst.empty_lebs <= c->main_lebs); | ||
| 660 | |||
| 661 | mutex_unlock(&c->lp_mutex); | ||
| 662 | } | ||
| 663 | |||
| 664 | /** | ||
| 665 | * ubifs_get_lp_stats - get lprops statistics. | ||
| 666 | * @c: UBIFS file-system description object | ||
| 667 | * @st: return statistics | ||
| 668 | */ | ||
| 669 | void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) | ||
| 670 | { | ||
| 671 | spin_lock(&c->space_lock); | ||
| 672 | memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); | ||
| 673 | spin_unlock(&c->space_lock); | ||
| 674 | } | ||
| 675 | |||
| 676 | /** | ||
| 677 | * ubifs_change_one_lp - change LEB properties. | ||
| 678 | * @c: the UBIFS file-system description object | ||
| 679 | * @lnum: LEB to change properties for | ||
| 680 | * @free: amount of free space | ||
| 681 | * @dirty: amount of dirty space | ||
| 682 | * @flags_set: flags to set | ||
| 683 | * @flags_clean: flags to clean | ||
| 684 | * @idx_gc_cnt: change to the count of idx_gc list | ||
| 685 | * | ||
| 686 | * This function changes properties of LEB @lnum. It is a helper wrapper over | ||
| 687 | * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the | ||
| 688 | * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and | ||
| 689 | * a negative error code in case of failure. | ||
| 690 | */ | ||
| 691 | int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, | ||
| 692 | int flags_set, int flags_clean, int idx_gc_cnt) | ||
| 693 | { | ||
| 694 | int err = 0, flags; | ||
| 695 | const struct ubifs_lprops *lp; | ||
| 696 | |||
| 697 | ubifs_get_lprops(c); | ||
| 698 | |||
| 699 | lp = ubifs_lpt_lookup_dirty(c, lnum); | ||
| 700 | if (IS_ERR(lp)) { | ||
| 701 | err = PTR_ERR(lp); | ||
| 702 | goto out; | ||
| 703 | } | ||
| 704 | |||
| 705 | flags = (lp->flags | flags_set) & ~flags_clean; | ||
| 706 | lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); | ||
| 707 | if (IS_ERR(lp)) | ||
| 708 | err = PTR_ERR(lp); | ||
| 709 | |||
| 710 | out: | ||
| 711 | ubifs_release_lprops(c); | ||
| 712 | return err; | ||
| 713 | } | ||
| 714 | |||
| 715 | /** | ||
| 716 | * ubifs_update_one_lp - update LEB properties. | ||
| 717 | * @c: the UBIFS file-system description object | ||
| 718 | * @lnum: LEB to change properties for | ||
| 719 | * @free: amount of free space | ||
| 720 | * @dirty: amount of dirty space to add | ||
| 721 | * @flags_set: flags to set | ||
| 722 | * @flags_clean: flags to clean | ||
| 723 | * | ||
| 724 | * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to | ||
| 725 | * current dirty space, not substitutes it. | ||
| 726 | */ | ||
| 727 | int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, | ||
| 728 | int flags_set, int flags_clean) | ||
| 729 | { | ||
| 730 | int err = 0, flags; | ||
| 731 | const struct ubifs_lprops *lp; | ||
| 732 | |||
| 733 | ubifs_get_lprops(c); | ||
| 734 | |||
| 735 | lp = ubifs_lpt_lookup_dirty(c, lnum); | ||
| 736 | if (IS_ERR(lp)) { | ||
| 737 | err = PTR_ERR(lp); | ||
| 738 | goto out; | ||
| 739 | } | ||
| 740 | |||
| 741 | flags = (lp->flags | flags_set) & ~flags_clean; | ||
| 742 | lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); | ||
| 743 | if (IS_ERR(lp)) | ||
| 744 | err = PTR_ERR(lp); | ||
| 745 | |||
| 746 | out: | ||
| 747 | ubifs_release_lprops(c); | ||
| 748 | return err; | ||
| 749 | } | ||
| 750 | |||
| 751 | /** | ||
| 752 | * ubifs_read_one_lp - read LEB properties. | ||
| 753 | * @c: the UBIFS file-system description object | ||
| 754 | * @lnum: LEB to read properties for | ||
| 755 | * @lp: where to store read properties | ||
| 756 | * | ||
| 757 | * This helper function reads properties of a LEB @lnum and stores them in @lp. | ||
| 758 | * Returns zero in case of success and a negative error code in case of | ||
| 759 | * failure. | ||
| 760 | */ | ||
| 761 | int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) | ||
| 762 | { | ||
| 763 | int err = 0; | ||
| 764 | const struct ubifs_lprops *lpp; | ||
| 765 | |||
| 766 | ubifs_get_lprops(c); | ||
| 767 | |||
| 768 | lpp = ubifs_lpt_lookup(c, lnum); | ||
| 769 | if (IS_ERR(lpp)) { | ||
| 770 | err = PTR_ERR(lpp); | ||
| 771 | goto out; | ||
| 772 | } | ||
| 773 | |||
| 774 | memcpy(lp, lpp, sizeof(struct ubifs_lprops)); | ||
| 775 | |||
| 776 | out: | ||
| 777 | ubifs_release_lprops(c); | ||
| 778 | return err; | ||
| 779 | } | ||
| 780 | |||
| 781 | /** | ||
| 782 | * ubifs_fast_find_free - try to find a LEB with free space quickly. | ||
| 783 | * @c: the UBIFS file-system description object | ||
| 784 | * | ||
| 785 | * This function returns LEB properties for a LEB with free space or %NULL if | ||
| 786 | * the function is unable to find a LEB quickly. | ||
| 787 | */ | ||
| 788 | const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) | ||
| 789 | { | ||
| 790 | struct ubifs_lprops *lprops; | ||
| 791 | struct ubifs_lpt_heap *heap; | ||
| 792 | |||
| 793 | ubifs_assert(mutex_is_locked(&c->lp_mutex)); | ||
| 794 | |||
| 795 | heap = &c->lpt_heap[LPROPS_FREE - 1]; | ||
| 796 | if (heap->cnt == 0) | ||
| 797 | return NULL; | ||
| 798 | |||
| 799 | lprops = heap->arr[0]; | ||
| 800 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 801 | ubifs_assert(!(lprops->flags & LPROPS_INDEX)); | ||
| 802 | return lprops; | ||
| 803 | } | ||
| 804 | |||
| 805 | /** | ||
| 806 | * ubifs_fast_find_empty - try to find an empty LEB quickly. | ||
| 807 | * @c: the UBIFS file-system description object | ||
| 808 | * | ||
| 809 | * This function returns LEB properties for an empty LEB or %NULL if the | ||
| 810 | * function is unable to find an empty LEB quickly. | ||
| 811 | */ | ||
| 812 | const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) | ||
| 813 | { | ||
| 814 | struct ubifs_lprops *lprops; | ||
| 815 | |||
| 816 | ubifs_assert(mutex_is_locked(&c->lp_mutex)); | ||
| 817 | |||
| 818 | if (list_empty(&c->empty_list)) | ||
| 819 | return NULL; | ||
| 820 | |||
| 821 | lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); | ||
| 822 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 823 | ubifs_assert(!(lprops->flags & LPROPS_INDEX)); | ||
| 824 | ubifs_assert(lprops->free == c->leb_size); | ||
| 825 | return lprops; | ||
| 826 | } | ||
| 827 | |||
| 828 | /** | ||
| 829 | * ubifs_fast_find_freeable - try to find a freeable LEB quickly. | ||
| 830 | * @c: the UBIFS file-system description object | ||
| 831 | * | ||
| 832 | * This function returns LEB properties for a freeable LEB or %NULL if the | ||
| 833 | * function is unable to find a freeable LEB quickly. | ||
| 834 | */ | ||
| 835 | const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) | ||
| 836 | { | ||
| 837 | struct ubifs_lprops *lprops; | ||
| 838 | |||
| 839 | ubifs_assert(mutex_is_locked(&c->lp_mutex)); | ||
| 840 | |||
| 841 | if (list_empty(&c->freeable_list)) | ||
| 842 | return NULL; | ||
| 843 | |||
| 844 | lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); | ||
| 845 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 846 | ubifs_assert(!(lprops->flags & LPROPS_INDEX)); | ||
| 847 | ubifs_assert(lprops->free + lprops->dirty == c->leb_size); | ||
| 848 | ubifs_assert(c->freeable_cnt > 0); | ||
| 849 | return lprops; | ||
| 850 | } | ||
| 851 | |||
| 852 | /** | ||
| 853 | * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. | ||
| 854 | * @c: the UBIFS file-system description object | ||
| 855 | * | ||
| 856 | * This function returns LEB properties for a freeable index LEB or %NULL if the | ||
| 857 | * function is unable to find a freeable index LEB quickly. | ||
| 858 | */ | ||
| 859 | const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) | ||
| 860 | { | ||
| 861 | struct ubifs_lprops *lprops; | ||
| 862 | |||
| 863 | ubifs_assert(mutex_is_locked(&c->lp_mutex)); | ||
| 864 | |||
| 865 | if (list_empty(&c->frdi_idx_list)) | ||
| 866 | return NULL; | ||
| 867 | |||
| 868 | lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); | ||
| 869 | ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); | ||
| 870 | ubifs_assert((lprops->flags & LPROPS_INDEX)); | ||
| 871 | ubifs_assert(lprops->free + lprops->dirty == c->leb_size); | ||
| 872 | return lprops; | ||
| 873 | } | ||
| 874 | |||
| 875 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 876 | |||
/**
 * dbg_check_cats - check category heaps and lists.
 * @c: UBIFS file-system description object
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int dbg_check_cats(struct ubifs_info *c)
{
	struct ubifs_lprops *lprops;
	struct list_head *pos;
	int i, cat;

	/* Skip unless general or lprops checking is enabled */
	if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
		return 0;

	/* Empty list: every entry must be fully free and not taken */
	list_for_each_entry(lprops, &c->empty_list, list) {
		if (lprops->free != c->leb_size) {
			ubifs_err("non-empty LEB %d on empty list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
		if (lprops->flags & LPROPS_TAKEN) {
			ubifs_err("taken LEB %d on empty list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
	}

	/* Freeable list: free + dirty must cover the whole LEB */
	i = 0;
	list_for_each_entry(lprops, &c->freeable_list, list) {
		if (lprops->free + lprops->dirty != c->leb_size) {
			ubifs_err("non-freeable LEB %d on freeable list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
		if (lprops->flags & LPROPS_TAKEN) {
			ubifs_err("taken LEB %d on freeable list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
		i += 1;
	}
	/* The counted length must match the cached freeable_cnt */
	if (i != c->freeable_cnt) {
		ubifs_err("freeable list count %d expected %d", i,
			  c->freeable_cnt);
		return -EINVAL;
	}

	/* idx_gc list length must match the cached idx_gc_cnt */
	i = 0;
	list_for_each(pos, &c->idx_gc)
		i += 1;
	if (i != c->idx_gc_cnt) {
		ubifs_err("idx_gc list count %d expected %d", i,
			  c->idx_gc_cnt);
		return -EINVAL;
	}

	/* frdi_idx list: freeable, not taken, and must be index LEBs */
	list_for_each_entry(lprops, &c->frdi_idx_list, list) {
		if (lprops->free + lprops->dirty != c->leb_size) {
			ubifs_err("non-freeable LEB %d on frdi_idx list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
		if (lprops->flags & LPROPS_TAKEN) {
			ubifs_err("taken LEB %d on frdi_idx list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
		if (!(lprops->flags & LPROPS_INDEX)) {
			ubifs_err("non-index LEB %d on frdi_idx list "
				  "(free %d dirty %d flags %d)", lprops->lnum,
				  lprops->free, lprops->dirty, lprops->flags);
			return -EINVAL;
		}
	}

	/* Heaps: entries must be non-NULL, back-reference their slot, and
	 * not be taken */
	for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) {
		struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];

		for (i = 0; i < heap->cnt; i++) {
			lprops = heap->arr[i];
			if (!lprops) {
				ubifs_err("null ptr in LPT heap cat %d", cat);
				return -EINVAL;
			}
			if (lprops->hpos != i) {
				ubifs_err("bad ptr in LPT heap cat %d", cat);
				return -EINVAL;
			}
			if (lprops->flags & LPROPS_TAKEN) {
				ubifs_err("taken LEB in LPT heap cat %d", cat);
				return -EINVAL;
			}
		}
	}

	return 0;
}
| 981 | |||
| 982 | void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, | ||
| 983 | int add_pos) | ||
| 984 | { | ||
| 985 | int i = 0, j, err = 0; | ||
| 986 | |||
| 987 | if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) | ||
| 988 | return; | ||
| 989 | |||
| 990 | for (i = 0; i < heap->cnt; i++) { | ||
| 991 | struct ubifs_lprops *lprops = heap->arr[i]; | ||
| 992 | struct ubifs_lprops *lp; | ||
| 993 | |||
| 994 | if (i != add_pos) | ||
| 995 | if ((lprops->flags & LPROPS_CAT_MASK) != cat) { | ||
| 996 | err = 1; | ||
| 997 | goto out; | ||
| 998 | } | ||
| 999 | if (lprops->hpos != i) { | ||
| 1000 | err = 2; | ||
| 1001 | goto out; | ||
| 1002 | } | ||
| 1003 | lp = ubifs_lpt_lookup(c, lprops->lnum); | ||
| 1004 | if (IS_ERR(lp)) { | ||
| 1005 | err = 3; | ||
| 1006 | goto out; | ||
| 1007 | } | ||
| 1008 | if (lprops != lp) { | ||
| 1009 | dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", | ||
| 1010 | (size_t)lprops, (size_t)lp, lprops->lnum, | ||
| 1011 | lp->lnum); | ||
| 1012 | err = 4; | ||
| 1013 | goto out; | ||
| 1014 | } | ||
| 1015 | for (j = 0; j < i; j++) { | ||
| 1016 | lp = heap->arr[j]; | ||
| 1017 | if (lp == lprops) { | ||
| 1018 | err = 5; | ||
| 1019 | goto out; | ||
| 1020 | } | ||
| 1021 | if (lp->lnum == lprops->lnum) { | ||
| 1022 | err = 6; | ||
| 1023 | goto out; | ||
| 1024 | } | ||
| 1025 | } | ||
| 1026 | } | ||
| 1027 | out: | ||
| 1028 | if (err) { | ||
| 1029 | dbg_msg("failed cat %d hpos %d err %d", cat, i, err); | ||
| 1030 | dbg_dump_stack(); | ||
| 1031 | dbg_dump_heap(c, heap, cat); | ||
| 1032 | } | ||
| 1033 | } | ||
| 1034 | |||
/**
 * struct scan_check_data - data provided to scan callback function.
 * @lst: LEB properties statistics accumulated over the scanned LEBs
 * @err: error code (%0 while the scan has found no problems)
 */
struct scan_check_data {
	struct ubifs_lp_stats lst;
	int err;
};
| 1044 | |||
/**
 * scan_check_cb - scan callback.
 * @c: the UBIFS file-system description object
 * @lp: LEB properties to scan
 * @in_tree: whether the LEB properties are in main memory
 * @data: information passed to and from the caller of the scan
 *
 * This function scans LEB @lp->lnum and cross-checks the on-flash contents
 * against the recorded LEB properties, accumulating per-LEB statistics into
 * @data->lst. On inconsistency, @data->err is set to %-EINVAL.
 *
 * This function returns a code that indicates whether the scan should continue
 * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
 * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
 * (%LPT_SCAN_STOP).
 */
static int scan_check_cb(struct ubifs_info *c,
			 const struct ubifs_lprops *lp, int in_tree,
			 struct scan_check_data *data)
{
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	struct ubifs_lp_stats *lst = &data->lst;
	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;

	/* The recorded category must match a fresh categorization */
	cat = lp->flags & LPROPS_CAT_MASK;
	if (cat != LPROPS_UNCAT) {
		cat = ubifs_categorize_lprops(c, lp);
		if (cat != (lp->flags & LPROPS_CAT_MASK)) {
			ubifs_err("bad LEB category %d expected %d",
				  (lp->flags & LPROPS_CAT_MASK), cat);
			goto out;
		}
	}

	/* Check lp is on its category list (if it has one) */
	if (in_tree) {
		struct list_head *list = NULL;

		switch (cat) {
		case LPROPS_EMPTY:
			list = &c->empty_list;
			break;
		case LPROPS_FREEABLE:
			list = &c->freeable_list;
			break;
		case LPROPS_FRDI_IDX:
			list = &c->frdi_idx_list;
			break;
		case LPROPS_UNCAT:
			list = &c->uncat_list;
			break;
		}
		if (list) {
			struct ubifs_lprops *lprops;
			int found = 0;

			list_for_each_entry(lprops, list, list) {
				if (lprops == lp) {
					found = 1;
					break;
				}
			}
			if (!found) {
				ubifs_err("bad LPT list (category %d)", cat);
				goto out;
			}
		}
	}

	/* Check lp is on its category heap (if it has one) */
	if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) {
		struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];

		/*
		 * NOTE(review): if lp->hpos were -1, the second comparison
		 * would access heap->arr[-1] before the check fails -
		 * presumably a categorized LEB always has a valid hpos;
		 * verify.
		 */
		if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
		    lp != heap->arr[lp->hpos]) {
			ubifs_err("bad LPT heap (category %d)", cat);
			goto out;
		}
	}

	sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
	if (IS_ERR(sleb)) {
		/*
		 * After an unclean unmount, empty and freeable LEBs
		 * may contain garbage.
		 */
		if (lp->free == c->leb_size) {
			ubifs_err("scan errors were in empty LEB "
				  "- continuing checking");
			lst->empty_lebs += 1;
			lst->total_free += c->leb_size;
			lst->total_dark += calc_dark(c, c->leb_size);
			return LPT_SCAN_CONTINUE;
		}

		if (lp->free + lp->dirty == c->leb_size &&
		    !(lp->flags & LPROPS_INDEX)) {
			ubifs_err("scan errors were in freeable LEB "
				  "- continuing checking");
			lst->total_free += lp->free;
			lst->total_dirty += lp->dirty;
			lst->total_dark += calc_dark(c, c->leb_size);
			return LPT_SCAN_CONTINUE;
		}
		data->err = PTR_ERR(sleb);
		return LPT_SCAN_STOP;
	}

	/*
	 * Walk the scanned nodes. -1 means "LEB kind not decided yet"; the
	 * first node decides whether this is an index LEB or a data LEB.
	 */
	is_idx = -1;
	list_for_each_entry(snod, &sleb->nodes, list) {
		int found, level = 0;

		cond_resched();

		if (is_idx == -1)
			is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;

		/* Index and data nodes must not be mixed in one LEB */
		if (is_idx && snod->type != UBIFS_IDX_NODE) {
			ubifs_err("indexing node in data LEB %d:%d",
				  lnum, snod->offs);
			goto out_destroy;
		}

		if (snod->type == UBIFS_IDX_NODE) {
			struct ubifs_idx_node *idx = snod->node;

			key_read(c, ubifs_idx_key(c, idx), &snod->key);
			level = le16_to_cpu(idx->level);
		}

		/* Only nodes still referenced by the TNC count as used */
		found = ubifs_tnc_has_node(c, &snod->key, level, lnum,
					   snod->offs, is_idx);
		if (found) {
			if (found < 0)
				goto out_destroy;
			used += ALIGN(snod->len, 8);
		}
	}

	free = c->leb_size - sleb->endpt;
	dirty = sleb->endpt - used;

	if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
	    dirty < 0) {
		ubifs_err("bad calculated accounting for LEB %d: "
			  "free %d, dirty %d", lnum, free, dirty);
		goto out_destroy;
	}

	if (lp->free + lp->dirty == c->leb_size &&
	    free + dirty == c->leb_size)
		if ((is_idx && !(lp->flags & LPROPS_INDEX)) ||
		    (!is_idx && free == c->leb_size) ||
		    lp->free == c->leb_size) {
			/*
			 * Empty or freeable LEBs could contain index
			 * nodes from an uncompleted commit due to an
			 * unclean unmount. Or they could be empty for
			 * the same reason. Or it may simply not have been
			 * unmapped.
			 */
			free = lp->free;
			dirty = lp->dirty;
			is_idx = 0;
		}

	if (is_idx && lp->free + lp->dirty == free + dirty &&
	    lnum != c->ihead_lnum) {
		/*
		 * After an unclean unmount, an index LEB could have a different
		 * amount of free space than the value recorded by lprops. That
		 * is because the in-the-gaps method may use free space or
		 * create free space (as a side-effect of using ubi_leb_change
		 * and not writing the whole LEB). The incorrect free space
		 * value is not a problem because the index is only ever
		 * allocated empty LEBs, so there will never be an attempt to
		 * write to the free space at the end of an index LEB - except
		 * by the in-the-gaps method for which it is not a problem.
		 */
		free = lp->free;
		dirty = lp->dirty;
	}

	if (lp->free != free || lp->dirty != dirty)
		goto out_print;

	if (is_idx && !(lp->flags & LPROPS_INDEX)) {
		if (free == c->leb_size)
			/* Free but not unmapped LEB, it's fine */
			is_idx = 0;
		else {
			ubifs_err("indexing node without indexing "
				  "flag");
			goto out_print;
		}
	}

	if (!is_idx && (lp->flags & LPROPS_INDEX)) {
		ubifs_err("data node with indexing flag");
		goto out_print;
	}

	/* Fold this LEB into the overall statistics */
	if (free == c->leb_size)
		lst->empty_lebs += 1;

	if (is_idx)
		lst->idx_lebs += 1;

	if (!(lp->flags & LPROPS_INDEX))
		lst->total_used += c->leb_size - free - dirty;
	lst->total_free += free;
	lst->total_dirty += dirty;

	if (!(lp->flags & LPROPS_INDEX)) {
		int spc = free + dirty;

		/* Reclaimable space below the dead watermark is "dead" */
		if (spc < c->dead_wm)
			lst->total_dead += spc;
		else
			lst->total_dark += calc_dark(c, spc);
	}

	ubifs_scan_destroy(sleb);

	return LPT_SCAN_CONTINUE;

out_print:
	ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
		  "should be free %d, dirty %d",
		  lnum, lp->free, lp->dirty, lp->flags, free, dirty);
	dbg_dump_leb(c, lnum);
out_destroy:
	ubifs_scan_destroy(sleb);
out:
	data->err = -EINVAL;
	return LPT_SCAN_STOP;
}
| 1279 | |||
| 1280 | /** | ||
| 1281 | * dbg_check_lprops - check all LEB properties. | ||
| 1282 | * @c: UBIFS file-system description object | ||
| 1283 | * | ||
| 1284 | * This function checks all LEB properties and makes sure they are all correct. | ||
| 1285 | * It returns zero if everything is fine, %-EINVAL if there is an inconsistency | ||
| 1286 | * and other negative error codes in case of other errors. This function is | ||
| 1287 | * called while the file system is locked (because of commit start), so no | ||
| 1288 | * additional locking is required. Note that locking the LPT mutex would cause | ||
| 1289 | * a circular lock dependency with the TNC mutex. | ||
| 1290 | */ | ||
| 1291 | int dbg_check_lprops(struct ubifs_info *c) | ||
| 1292 | { | ||
| 1293 | int i, err; | ||
| 1294 | struct scan_check_data data; | ||
| 1295 | struct ubifs_lp_stats *lst = &data.lst; | ||
| 1296 | |||
| 1297 | if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) | ||
| 1298 | return 0; | ||
| 1299 | |||
| 1300 | /* | ||
| 1301 | * As we are going to scan the media, the write buffers have to be | ||
| 1302 | * synchronized. | ||
| 1303 | */ | ||
| 1304 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 1305 | err = ubifs_wbuf_sync(&c->jheads[i].wbuf); | ||
| 1306 | if (err) | ||
| 1307 | return err; | ||
| 1308 | } | ||
| 1309 | |||
| 1310 | memset(lst, 0, sizeof(struct ubifs_lp_stats)); | ||
| 1311 | |||
| 1312 | data.err = 0; | ||
| 1313 | err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, | ||
| 1314 | (ubifs_lpt_scan_callback)scan_check_cb, | ||
| 1315 | &data); | ||
| 1316 | if (err && err != -ENOSPC) | ||
| 1317 | goto out; | ||
| 1318 | if (data.err) { | ||
| 1319 | err = data.err; | ||
| 1320 | goto out; | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | if (lst->empty_lebs != c->lst.empty_lebs || | ||
| 1324 | lst->idx_lebs != c->lst.idx_lebs || | ||
| 1325 | lst->total_free != c->lst.total_free || | ||
| 1326 | lst->total_dirty != c->lst.total_dirty || | ||
| 1327 | lst->total_used != c->lst.total_used) { | ||
| 1328 | ubifs_err("bad overall accounting"); | ||
| 1329 | ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " | ||
| 1330 | "total_free %lld, total_dirty %lld, total_used %lld", | ||
| 1331 | lst->empty_lebs, lst->idx_lebs, lst->total_free, | ||
| 1332 | lst->total_dirty, lst->total_used); | ||
| 1333 | ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " | ||
| 1334 | "total_free %lld, total_dirty %lld, total_used %lld", | ||
| 1335 | c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, | ||
| 1336 | c->lst.total_dirty, c->lst.total_used); | ||
| 1337 | err = -EINVAL; | ||
| 1338 | goto out; | ||
| 1339 | } | ||
| 1340 | |||
| 1341 | if (lst->total_dead != c->lst.total_dead || | ||
| 1342 | lst->total_dark != c->lst.total_dark) { | ||
| 1343 | ubifs_err("bad dead/dark space accounting"); | ||
| 1344 | ubifs_err("calculated: total_dead %lld, total_dark %lld", | ||
| 1345 | lst->total_dead, lst->total_dark); | ||
| 1346 | ubifs_err("read from lprops: total_dead %lld, total_dark %lld", | ||
| 1347 | c->lst.total_dead, c->lst.total_dark); | ||
| 1348 | err = -EINVAL; | ||
| 1349 | goto out; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | err = dbg_check_cats(c); | ||
| 1353 | out: | ||
| 1354 | return err; | ||
| 1355 | } | ||
| 1356 | |||
| 1357 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c new file mode 100644 index 000000000000..9ff2463177e5 --- /dev/null +++ b/fs/ubifs/lpt.c | |||
| @@ -0,0 +1,2243 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements the LEB properties tree (LPT) area. The LPT area | ||
| 25 | * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and | ||
| 26 | * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits | ||
| 27 | * between the log and the orphan area. | ||
| 28 | * | ||
| 29 | * The LPT area is like a miniature self-contained file system. It is required | ||
| 30 | * that it never runs out of space, is fast to access and update, and scales | ||
| 31 | * logarithmically. The LEB properties tree is implemented as a wandering tree | ||
| 32 | * much like the TNC, and the LPT area has its own garbage collection. | ||
| 33 | * | ||
| 34 | * The LPT has two slightly different forms called the "small model" and the | ||
| 35 | * "big model". The small model is used when the entire LEB properties table | ||
| 36 | * can be written into a single eraseblock. In that case, garbage collection | ||
| 37 | * consists of just writing the whole table, which therefore makes all other | ||
| 38 | * eraseblocks reusable. In the case of the big model, dirty eraseblocks are | ||
| 39 | * selected for garbage collection, which consists of marking the nodes in | ||
| 40 | * that LEB as dirty, and then only the dirty nodes are written out. Also, in | ||
| 41 | * the case of the big model, a table of LEB numbers is saved so that the entire | ||
| 42 | * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first | ||
| 43 | * mounted. | ||
| 44 | */ | ||
| 45 | |||
| 46 | #include <linux/crc16.h> | ||
| 47 | #include "ubifs.h" | ||
| 48 | |||
/**
 * do_calc_lpt_geom - calculate sizes for the LPT area.
 * @c: the UBIFS file-system description object
 *
 * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
 * properties of the flash and whether LPT is "big" (c->big_lpt).
 */
static void do_calc_lpt_geom(struct ubifs_info *c)
{
	int i, n, bits, per_leb_wastage, max_pnode_cnt;
	long long sz, tot_wastage;

	/* Worst-case pnode count if the FS is resized up to c->max_leb_cnt */
	n = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
	max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);

	/* Smallest tree height whose leaf level can hold max_pnode_cnt */
	c->lpt_hght = 1;
	n = UBIFS_LPT_FANOUT;
	while (n < max_pnode_cnt) {
		c->lpt_hght += 1;
		n <<= UBIFS_LPT_FANOUT_SHIFT;
	}

	c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);

	/* Sum the nnode count over every internal level of the tree */
	n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
	c->nnode_cnt = n;
	for (i = 1; i < c->lpt_hght; i++) {
		n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
		c->nnode_cnt += n;
	}

	/*
	 * Bit widths. Space values are packed in units of 8 bytes (see the
	 * ">> 3" in ubifs_pack_pnode()), hence the "- 3".
	 */
	c->space_bits = fls(c->leb_size) - 3;
	c->lpt_lnum_bits = fls(c->lpt_lebs);
	c->lpt_offs_bits = fls(c->leb_size - 1);
	c->lpt_spc_bits = fls(c->leb_size);

	/* Bits needed to number any pnode */
	n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
	c->pcnt_bits = fls(n - 1);

	/* Bits needed to hold any LEB number */
	c->lnum_bits = fls(c->max_leb_cnt - 1);

	/* Packed pnode: CRC + type + (big model) number + per-LEB fields */
	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
	       (c->big_lpt ? c->pcnt_bits : 0) +
	       (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
	c->pnode_sz = (bits + 7) / 8;

	/* Packed nnode: CRC + type + (big model) number + branch pointers */
	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
	       (c->big_lpt ? c->pcnt_bits : 0) +
	       (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
	c->nnode_sz = (bits + 7) / 8;

	/* Packed ltab: CRC + type + a free/dirty pair per LPT LEB */
	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
	       c->lpt_lebs * c->lpt_spc_bits * 2;
	c->ltab_sz = (bits + 7) / 8;

	/* Packed lsave: CRC + type + lsave_cnt LEB numbers */
	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
	       c->lnum_bits * c->lsave_cnt;
	c->lsave_sz = (bits + 7) / 8;

	/* Calculate the minimum LPT size */
	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
	c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
	c->lpt_sz += c->ltab_sz;
	c->lpt_sz += c->lsave_sz;

	/* Add wastage: unusable space at the end of each LPT LEB */
	sz = c->lpt_sz;
	per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz);
	sz += per_leb_wastage;
	tot_wastage = per_leb_wastage;
	while (sz > c->leb_size) {
		sz += per_leb_wastage;
		sz -= c->leb_size;
		tot_wastage += per_leb_wastage;
	}
	tot_wastage += ALIGN(sz, c->min_io_size) - sz;
	c->lpt_sz += tot_wastage;
}
| 127 | |||
/**
 * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
 * @c: the UBIFS file-system description object
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_calc_lpt_geom(struct ubifs_info *c)
{
	int lebs_needed;
	uint64_t sz;

	do_calc_lpt_geom(c);

	/* Verify that lpt_lebs is big enough */
	sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
	sz += c->leb_size - 1;
	do_div(sz, c->leb_size);
	lebs_needed = sz;
	if (lebs_needed > c->lpt_lebs) {
		ubifs_err("too few LPT LEBs");
		return -EINVAL;
	}

	/* Verify that ltab fits in a single LEB (since ltab is a single node) */
	if (c->ltab_sz > c->leb_size) {
		ubifs_err("LPT ltab too big");
		return -EINVAL;
	}

	/* NOTE(review): flag consumed elsewhere; set only for the big model */
	c->check_lpt_free = c->big_lpt;

	return 0;
}
| 161 | |||
| 162 | /** | ||
| 163 | * calc_dflt_lpt_geom - calculate default LPT geometry. | ||
| 164 | * @c: the UBIFS file-system description object | ||
| 165 | * @main_lebs: number of main area LEBs is passed and returned here | ||
| 166 | * @big_lpt: whether the LPT area is "big" is returned here | ||
| 167 | * | ||
| 168 | * The size of the LPT area depends on parameters that themselves are dependent | ||
| 169 | * on the size of the LPT area. This function, successively recalculates the LPT | ||
| 170 | * area geometry until the parameters and resultant geometry are consistent. | ||
| 171 | * | ||
| 172 | * This function returns %0 on success and a negative error code on failure. | ||
| 173 | */ | ||
| 174 | static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, | ||
| 175 | int *big_lpt) | ||
| 176 | { | ||
| 177 | int i, lebs_needed; | ||
| 178 | uint64_t sz; | ||
| 179 | |||
| 180 | /* Start by assuming the minimum number of LPT LEBs */ | ||
| 181 | c->lpt_lebs = UBIFS_MIN_LPT_LEBS; | ||
| 182 | c->main_lebs = *main_lebs - c->lpt_lebs; | ||
| 183 | if (c->main_lebs <= 0) | ||
| 184 | return -EINVAL; | ||
| 185 | |||
| 186 | /* And assume we will use the small LPT model */ | ||
| 187 | c->big_lpt = 0; | ||
| 188 | |||
| 189 | /* | ||
| 190 | * Calculate the geometry based on assumptions above and then see if it | ||
| 191 | * makes sense | ||
| 192 | */ | ||
| 193 | do_calc_lpt_geom(c); | ||
| 194 | |||
| 195 | /* Small LPT model must have lpt_sz < leb_size */ | ||
| 196 | if (c->lpt_sz > c->leb_size) { | ||
| 197 | /* Nope, so try again using big LPT model */ | ||
| 198 | c->big_lpt = 1; | ||
| 199 | do_calc_lpt_geom(c); | ||
| 200 | } | ||
| 201 | |||
| 202 | /* Now check there are enough LPT LEBs */ | ||
| 203 | for (i = 0; i < 64 ; i++) { | ||
| 204 | sz = c->lpt_sz * 4; /* Allow 4 times the size */ | ||
| 205 | sz += c->leb_size - 1; | ||
| 206 | do_div(sz, c->leb_size); | ||
| 207 | lebs_needed = sz; | ||
| 208 | if (lebs_needed > c->lpt_lebs) { | ||
| 209 | /* Not enough LPT LEBs so try again with more */ | ||
| 210 | c->lpt_lebs = lebs_needed; | ||
| 211 | c->main_lebs = *main_lebs - c->lpt_lebs; | ||
| 212 | if (c->main_lebs <= 0) | ||
| 213 | return -EINVAL; | ||
| 214 | do_calc_lpt_geom(c); | ||
| 215 | continue; | ||
| 216 | } | ||
| 217 | if (c->ltab_sz > c->leb_size) { | ||
| 218 | ubifs_err("LPT ltab too big"); | ||
| 219 | return -EINVAL; | ||
| 220 | } | ||
| 221 | *main_lebs = c->main_lebs; | ||
| 222 | *big_lpt = c->big_lpt; | ||
| 223 | return 0; | ||
| 224 | } | ||
| 225 | return -EINVAL; | ||
| 226 | } | ||
| 227 | |||
/**
 * pack_bits - pack bit fields end-to-end.
 * @addr: address at which to pack (passed and next address returned)
 * @pos: bit position at which to pack (passed and next position returned)
 * @val: value to pack
 * @nrbits: number of bits of value to pack (1-32)
 *
 * Bits are packed low-order first: the least significant bits of @val land
 * in the lowest free bits of the current byte (mirrored by
 * ubifs_unpack_bits()).
 */
static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
{
	uint8_t *p = *addr;
	int b = *pos;

	ubifs_assert(nrbits > 0);
	ubifs_assert(nrbits <= 32);
	ubifs_assert(*pos >= 0);
	ubifs_assert(*pos < 8);
	ubifs_assert((val >> nrbits) == 0 || nrbits == 32);
	if (b) {
		/*
		 * Mid-byte start: OR the low bits of @val into the free high
		 * bits of the current byte, then spill whole bytes. After
		 * "nrbits += b", @nrbits counts bits from the start of the
		 * current byte.
		 */
		*p |= ((uint8_t)val) << b;
		nrbits += b;
		if (nrbits > 8) {
			*++p = (uint8_t)(val >>= (8 - b));
			if (nrbits > 16) {
				*++p = (uint8_t)(val >>= 8);
				if (nrbits > 24) {
					*++p = (uint8_t)(val >>= 8);
					if (nrbits > 32)
						*++p = (uint8_t)(val >>= 8);
				}
			}
		}
	} else {
		/* Byte-aligned start: whole bytes are simply assigned */
		*p = (uint8_t)val;
		if (nrbits > 8) {
			*++p = (uint8_t)(val >>= 8);
			if (nrbits > 16) {
				*++p = (uint8_t)(val >>= 8);
				if (nrbits > 24)
					*++p = (uint8_t)(val >>= 8);
			}
		}
	}
	/* Advance the cursor; a multiple of 8 bits starts a fresh byte */
	b = nrbits & 7;
	if (b == 0)
		p++;
	*addr = p;
	*pos = b;
}
| 276 | |||
/**
 * ubifs_unpack_bits - unpack bit fields.
 * @addr: address at which to unpack (passed and next address returned)
 * @pos: bit position at which to unpack (passed and next position returned)
 * @nrbits: number of bits of value to unpack (1-32)
 *
 * This function returns the value unpacked.
 */
uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
{
	/* Shift amount used below to discard bits above @nrbits */
	const int k = 32 - nrbits;
	uint8_t *p = *addr;
	int b = *pos;
	uint32_t val;

	ubifs_assert(nrbits > 0);
	ubifs_assert(nrbits <= 32);
	ubifs_assert(*pos >= 0);
	ubifs_assert(*pos < 8);
	if (b) {
		/*
		 * Mid-byte start: assemble the four following bytes, then
		 * splice in the remaining high bits of the current byte.
		 * NOTE(review): this reads up to p[4], which may be beyond
		 * the bytes that actually hold the value - presumably
		 * callers' buffers always have slack after the packed data;
		 * verify.
		 */
		val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) |
		      ((uint32_t)p[4] << 24);
		val <<= (8 - b);
		val |= *p >> b;
		nrbits += b;
	} else
		val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) |
		      ((uint32_t)p[3] << 24);
	/* Keep only the requested low bits */
	val <<= k;
	val >>= k;
	/* Advance the cursor past the consumed bits */
	b = nrbits & 7;
	p += nrbits / 8;
	*addr = p;
	*pos = b;
	ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32);
	return val;
}
| 314 | |||
| 315 | /** | ||
| 316 | * ubifs_pack_pnode - pack all the bit fields of a pnode. | ||
| 317 | * @c: UBIFS file-system description object | ||
| 318 | * @buf: buffer into which to pack | ||
| 319 | * @pnode: pnode to pack | ||
| 320 | */ | ||
| 321 | void ubifs_pack_pnode(struct ubifs_info *c, void *buf, | ||
| 322 | struct ubifs_pnode *pnode) | ||
| 323 | { | ||
| 324 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 325 | int i, pos = 0; | ||
| 326 | uint16_t crc; | ||
| 327 | |||
| 328 | pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS); | ||
| 329 | if (c->big_lpt) | ||
| 330 | pack_bits(&addr, &pos, pnode->num, c->pcnt_bits); | ||
| 331 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 332 | pack_bits(&addr, &pos, pnode->lprops[i].free >> 3, | ||
| 333 | c->space_bits); | ||
| 334 | pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3, | ||
| 335 | c->space_bits); | ||
| 336 | if (pnode->lprops[i].flags & LPROPS_INDEX) | ||
| 337 | pack_bits(&addr, &pos, 1, 1); | ||
| 338 | else | ||
| 339 | pack_bits(&addr, &pos, 0, 1); | ||
| 340 | } | ||
| 341 | crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, | ||
| 342 | c->pnode_sz - UBIFS_LPT_CRC_BYTES); | ||
| 343 | addr = buf; | ||
| 344 | pos = 0; | ||
| 345 | pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); | ||
| 346 | } | ||
| 347 | |||
| 348 | /** | ||
| 349 | * ubifs_pack_nnode - pack all the bit fields of a nnode. | ||
| 350 | * @c: UBIFS file-system description object | ||
| 351 | * @buf: buffer into which to pack | ||
| 352 | * @nnode: nnode to pack | ||
| 353 | */ | ||
| 354 | void ubifs_pack_nnode(struct ubifs_info *c, void *buf, | ||
| 355 | struct ubifs_nnode *nnode) | ||
| 356 | { | ||
| 357 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 358 | int i, pos = 0; | ||
| 359 | uint16_t crc; | ||
| 360 | |||
| 361 | pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS); | ||
| 362 | if (c->big_lpt) | ||
| 363 | pack_bits(&addr, &pos, nnode->num, c->pcnt_bits); | ||
| 364 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 365 | int lnum = nnode->nbranch[i].lnum; | ||
| 366 | |||
| 367 | if (lnum == 0) | ||
| 368 | lnum = c->lpt_last + 1; | ||
| 369 | pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits); | ||
| 370 | pack_bits(&addr, &pos, nnode->nbranch[i].offs, | ||
| 371 | c->lpt_offs_bits); | ||
| 372 | } | ||
| 373 | crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, | ||
| 374 | c->nnode_sz - UBIFS_LPT_CRC_BYTES); | ||
| 375 | addr = buf; | ||
| 376 | pos = 0; | ||
| 377 | pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); | ||
| 378 | } | ||
| 379 | |||
| 380 | /** | ||
| 381 | * ubifs_pack_ltab - pack the LPT's own lprops table. | ||
| 382 | * @c: UBIFS file-system description object | ||
| 383 | * @buf: buffer into which to pack | ||
| 384 | * @ltab: LPT's own lprops table to pack | ||
| 385 | */ | ||
| 386 | void ubifs_pack_ltab(struct ubifs_info *c, void *buf, | ||
| 387 | struct ubifs_lpt_lprops *ltab) | ||
| 388 | { | ||
| 389 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 390 | int i, pos = 0; | ||
| 391 | uint16_t crc; | ||
| 392 | |||
| 393 | pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS); | ||
| 394 | for (i = 0; i < c->lpt_lebs; i++) { | ||
| 395 | pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits); | ||
| 396 | pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits); | ||
| 397 | } | ||
| 398 | crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, | ||
| 399 | c->ltab_sz - UBIFS_LPT_CRC_BYTES); | ||
| 400 | addr = buf; | ||
| 401 | pos = 0; | ||
| 402 | pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); | ||
| 403 | } | ||
| 404 | |||
| 405 | /** | ||
| 406 | * ubifs_pack_lsave - pack the LPT's save table. | ||
| 407 | * @c: UBIFS file-system description object | ||
| 408 | * @buf: buffer into which to pack | ||
| 409 | * @lsave: LPT's save table to pack | ||
| 410 | */ | ||
| 411 | void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave) | ||
| 412 | { | ||
| 413 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 414 | int i, pos = 0; | ||
| 415 | uint16_t crc; | ||
| 416 | |||
| 417 | pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS); | ||
| 418 | for (i = 0; i < c->lsave_cnt; i++) | ||
| 419 | pack_bits(&addr, &pos, lsave[i], c->lnum_bits); | ||
| 420 | crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, | ||
| 421 | c->lsave_sz - UBIFS_LPT_CRC_BYTES); | ||
| 422 | addr = buf; | ||
| 423 | pos = 0; | ||
| 424 | pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); | ||
| 425 | } | ||
| 426 | |||
| 427 | /** | ||
| 428 | * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties. | ||
| 429 | * @c: UBIFS file-system description object | ||
| 430 | * @lnum: LEB number to which to add dirty space | ||
| 431 | * @dirty: amount of dirty space to add | ||
| 432 | */ | ||
| 433 | void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty) | ||
| 434 | { | ||
| 435 | if (!dirty || !lnum) | ||
| 436 | return; | ||
| 437 | dbg_lp("LEB %d add %d to %d", | ||
| 438 | lnum, dirty, c->ltab[lnum - c->lpt_first].dirty); | ||
| 439 | ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); | ||
| 440 | c->ltab[lnum - c->lpt_first].dirty += dirty; | ||
| 441 | } | ||
| 442 | |||
| 443 | /** | ||
| 444 | * set_ltab - set LPT LEB properties. | ||
| 445 | * @c: UBIFS file-system description object | ||
| 446 | * @lnum: LEB number | ||
| 447 | * @free: amount of free space | ||
| 448 | * @dirty: amount of dirty space | ||
| 449 | */ | ||
| 450 | static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty) | ||
| 451 | { | ||
| 452 | dbg_lp("LEB %d free %d dirty %d to %d %d", | ||
| 453 | lnum, c->ltab[lnum - c->lpt_first].free, | ||
| 454 | c->ltab[lnum - c->lpt_first].dirty, free, dirty); | ||
| 455 | ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); | ||
| 456 | c->ltab[lnum - c->lpt_first].free = free; | ||
| 457 | c->ltab[lnum - c->lpt_first].dirty = dirty; | ||
| 458 | } | ||
| 459 | |||
| 460 | /** | ||
| 461 | * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties. | ||
| 462 | * @c: UBIFS file-system description object | ||
| 463 | * @nnode: nnode for which to add dirt | ||
| 464 | */ | ||
| 465 | void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode) | ||
| 466 | { | ||
| 467 | struct ubifs_nnode *np = nnode->parent; | ||
| 468 | |||
| 469 | if (np) | ||
| 470 | ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum, | ||
| 471 | c->nnode_sz); | ||
| 472 | else { | ||
| 473 | ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz); | ||
| 474 | if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { | ||
| 475 | c->lpt_drty_flgs |= LTAB_DIRTY; | ||
| 476 | ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); | ||
| 477 | } | ||
| 478 | } | ||
| 479 | } | ||
| 480 | |||
| 481 | /** | ||
| 482 | * add_pnode_dirt - add dirty space to LPT LEB properties. | ||
| 483 | * @c: UBIFS file-system description object | ||
| 484 | * @pnode: pnode for which to add dirt | ||
| 485 | */ | ||
| 486 | static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) | ||
| 487 | { | ||
| 488 | ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, | ||
| 489 | c->pnode_sz); | ||
| 490 | } | ||
| 491 | |||
| 492 | /** | ||
| 493 | * calc_nnode_num - calculate nnode number. | ||
| 494 | * @row: the row in the tree (root is zero) | ||
| 495 | * @col: the column in the row (leftmost is zero) | ||
| 496 | * | ||
| 497 | * The nnode number is a number that uniquely identifies a nnode and can be used | ||
| 498 | * easily to traverse the tree from the root to that nnode. | ||
| 499 | * | ||
| 500 | * This function calculates and returns the nnode number for the nnode at @row | ||
| 501 | * and @col. | ||
| 502 | */ | ||
| 503 | static int calc_nnode_num(int row, int col) | ||
| 504 | { | ||
| 505 | int num, bits; | ||
| 506 | |||
| 507 | num = 1; | ||
| 508 | while (row--) { | ||
| 509 | bits = (col & (UBIFS_LPT_FANOUT - 1)); | ||
| 510 | col >>= UBIFS_LPT_FANOUT_SHIFT; | ||
| 511 | num <<= UBIFS_LPT_FANOUT_SHIFT; | ||
| 512 | num |= bits; | ||
| 513 | } | ||
| 514 | return num; | ||
| 515 | } | ||
| 516 | |||
| 517 | /** | ||
| 518 | * calc_nnode_num_from_parent - calculate nnode number. | ||
| 519 | * @c: UBIFS file-system description object | ||
| 520 | * @parent: parent nnode | ||
| 521 | * @iip: index in parent | ||
| 522 | * | ||
| 523 | * The nnode number is a number that uniquely identifies a nnode and can be used | ||
| 524 | * easily to traverse the tree from the root to that nnode. | ||
| 525 | * | ||
| 526 | * This function calculates and returns the nnode number based on the parent's | ||
| 527 | * nnode number and the index in parent. | ||
| 528 | */ | ||
| 529 | static int calc_nnode_num_from_parent(struct ubifs_info *c, | ||
| 530 | struct ubifs_nnode *parent, int iip) | ||
| 531 | { | ||
| 532 | int num, shft; | ||
| 533 | |||
| 534 | if (!parent) | ||
| 535 | return 1; | ||
| 536 | shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT; | ||
| 537 | num = parent->num ^ (1 << shft); | ||
| 538 | num |= (UBIFS_LPT_FANOUT + iip) << shft; | ||
| 539 | return num; | ||
| 540 | } | ||
| 541 | |||
| 542 | /** | ||
| 543 | * calc_pnode_num_from_parent - calculate pnode number. | ||
| 544 | * @c: UBIFS file-system description object | ||
| 545 | * @parent: parent nnode | ||
| 546 | * @iip: index in parent | ||
| 547 | * | ||
| 548 | * The pnode number is a number that uniquely identifies a pnode and can be used | ||
| 549 | * easily to traverse the tree from the root to that pnode. | ||
| 550 | * | ||
| 551 | * This function calculates and returns the pnode number based on the parent's | ||
| 552 | * nnode number and the index in parent. | ||
| 553 | */ | ||
| 554 | static int calc_pnode_num_from_parent(struct ubifs_info *c, | ||
| 555 | struct ubifs_nnode *parent, int iip) | ||
| 556 | { | ||
| 557 | int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; | ||
| 558 | |||
| 559 | for (i = 0; i < n; i++) { | ||
| 560 | num <<= UBIFS_LPT_FANOUT_SHIFT; | ||
| 561 | num |= pnum & (UBIFS_LPT_FANOUT - 1); | ||
| 562 | pnum >>= UBIFS_LPT_FANOUT_SHIFT; | ||
| 563 | } | ||
| 564 | num <<= UBIFS_LPT_FANOUT_SHIFT; | ||
| 565 | num |= iip; | ||
| 566 | return num; | ||
| 567 | } | ||
| 568 | |||
| 569 | /** | ||
| 570 | * ubifs_create_dflt_lpt - create default LPT. | ||
| 571 | * @c: UBIFS file-system description object | ||
| 572 | * @main_lebs: number of main area LEBs is passed and returned here | ||
| 573 | * @lpt_first: LEB number of first LPT LEB | ||
| 574 | * @lpt_lebs: number of LEBs for LPT is passed and returned here | ||
| 575 | * @big_lpt: use big LPT model is passed and returned here | ||
| 576 | * | ||
| 577 | * This function returns %0 on success and a negative error code on failure. | ||
| 578 | */ | ||
| 579 | int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, | ||
| 580 | int *lpt_lebs, int *big_lpt) | ||
| 581 | { | ||
| 582 | int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; | ||
| 583 | int blnum, boffs, bsz, bcnt; | ||
| 584 | struct ubifs_pnode *pnode = NULL; | ||
| 585 | struct ubifs_nnode *nnode = NULL; | ||
| 586 | void *buf = NULL, *p; | ||
| 587 | struct ubifs_lpt_lprops *ltab = NULL; | ||
| 588 | int *lsave = NULL; | ||
| 589 | |||
| 590 | err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); | ||
| 591 | if (err) | ||
| 592 | return err; | ||
| 593 | *lpt_lebs = c->lpt_lebs; | ||
| 594 | |||
| 595 | /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */ | ||
| 596 | c->lpt_first = lpt_first; | ||
| 597 | /* Needed by 'set_ltab()' */ | ||
| 598 | c->lpt_last = lpt_first + c->lpt_lebs - 1; | ||
| 599 | /* Needed by 'ubifs_pack_lsave()' */ | ||
| 600 | c->main_first = c->leb_cnt - *main_lebs; | ||
| 601 | |||
| 602 | lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); | ||
| 603 | pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); | ||
| 604 | nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); | ||
| 605 | buf = vmalloc(c->leb_size); | ||
| 606 | ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); | ||
| 607 | if (!pnode || !nnode || !buf || !ltab || !lsave) { | ||
| 608 | err = -ENOMEM; | ||
| 609 | goto out; | ||
| 610 | } | ||
| 611 | |||
| 612 | ubifs_assert(!c->ltab); | ||
| 613 | c->ltab = ltab; /* Needed by set_ltab */ | ||
| 614 | |||
| 615 | /* Initialize LPT's own lprops */ | ||
| 616 | for (i = 0; i < c->lpt_lebs; i++) { | ||
| 617 | ltab[i].free = c->leb_size; | ||
| 618 | ltab[i].dirty = 0; | ||
| 619 | ltab[i].tgc = 0; | ||
| 620 | ltab[i].cmt = 0; | ||
| 621 | } | ||
| 622 | |||
| 623 | lnum = lpt_first; | ||
| 624 | p = buf; | ||
| 625 | /* Number of leaf nodes (pnodes) */ | ||
| 626 | cnt = c->pnode_cnt; | ||
| 627 | |||
| 628 | /* | ||
| 629 | * The first pnode contains the LEB properties for the LEBs that contain | ||
| 630 | * the root inode node and the root index node of the index tree. | ||
| 631 | */ | ||
| 632 | node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8); | ||
| 633 | iopos = ALIGN(node_sz, c->min_io_size); | ||
| 634 | pnode->lprops[0].free = c->leb_size - iopos; | ||
| 635 | pnode->lprops[0].dirty = iopos - node_sz; | ||
| 636 | pnode->lprops[0].flags = LPROPS_INDEX; | ||
| 637 | |||
| 638 | node_sz = UBIFS_INO_NODE_SZ; | ||
| 639 | iopos = ALIGN(node_sz, c->min_io_size); | ||
| 640 | pnode->lprops[1].free = c->leb_size - iopos; | ||
| 641 | pnode->lprops[1].dirty = iopos - node_sz; | ||
| 642 | |||
| 643 | for (i = 2; i < UBIFS_LPT_FANOUT; i++) | ||
| 644 | pnode->lprops[i].free = c->leb_size; | ||
| 645 | |||
| 646 | /* Add first pnode */ | ||
| 647 | ubifs_pack_pnode(c, p, pnode); | ||
| 648 | p += c->pnode_sz; | ||
| 649 | len = c->pnode_sz; | ||
| 650 | pnode->num += 1; | ||
| 651 | |||
| 652 | /* Reset pnode values for remaining pnodes */ | ||
| 653 | pnode->lprops[0].free = c->leb_size; | ||
| 654 | pnode->lprops[0].dirty = 0; | ||
| 655 | pnode->lprops[0].flags = 0; | ||
| 656 | |||
| 657 | pnode->lprops[1].free = c->leb_size; | ||
| 658 | pnode->lprops[1].dirty = 0; | ||
| 659 | |||
| 660 | /* | ||
| 661 | * To calculate the internal node branches, we keep information about | ||
| 662 | * the level below. | ||
| 663 | */ | ||
| 664 | blnum = lnum; /* LEB number of level below */ | ||
| 665 | boffs = 0; /* Offset of level below */ | ||
| 666 | bcnt = cnt; /* Number of nodes in level below */ | ||
| 667 | bsz = c->pnode_sz; /* Size of nodes in level below */ | ||
| 668 | |||
| 669 | /* Add all remaining pnodes */ | ||
| 670 | for (i = 1; i < cnt; i++) { | ||
| 671 | if (len + c->pnode_sz > c->leb_size) { | ||
| 672 | alen = ALIGN(len, c->min_io_size); | ||
| 673 | set_ltab(c, lnum, c->leb_size - alen, alen - len); | ||
| 674 | memset(p, 0xff, alen - len); | ||
| 675 | err = ubi_leb_change(c->ubi, lnum++, buf, alen, | ||
| 676 | UBI_SHORTTERM); | ||
| 677 | if (err) | ||
| 678 | goto out; | ||
| 679 | p = buf; | ||
| 680 | len = 0; | ||
| 681 | } | ||
| 682 | ubifs_pack_pnode(c, p, pnode); | ||
| 683 | p += c->pnode_sz; | ||
| 684 | len += c->pnode_sz; | ||
| 685 | /* | ||
| 686 | * pnodes are simply numbered left to right starting at zero, | ||
| 687 | * which means the pnode number can be used easily to traverse | ||
| 688 | * down the tree to the corresponding pnode. | ||
| 689 | */ | ||
| 690 | pnode->num += 1; | ||
| 691 | } | ||
| 692 | |||
| 693 | row = 0; | ||
| 694 | for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT) | ||
| 695 | row += 1; | ||
| 696 | /* Add all nnodes, one level at a time */ | ||
| 697 | while (1) { | ||
| 698 | /* Number of internal nodes (nnodes) at next level */ | ||
| 699 | cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT); | ||
| 700 | for (i = 0; i < cnt; i++) { | ||
| 701 | if (len + c->nnode_sz > c->leb_size) { | ||
| 702 | alen = ALIGN(len, c->min_io_size); | ||
| 703 | set_ltab(c, lnum, c->leb_size - alen, | ||
| 704 | alen - len); | ||
| 705 | memset(p, 0xff, alen - len); | ||
| 706 | err = ubi_leb_change(c->ubi, lnum++, buf, alen, | ||
| 707 | UBI_SHORTTERM); | ||
| 708 | if (err) | ||
| 709 | goto out; | ||
| 710 | p = buf; | ||
| 711 | len = 0; | ||
| 712 | } | ||
| 713 | /* Only 1 nnode at this level, so it is the root */ | ||
| 714 | if (cnt == 1) { | ||
| 715 | c->lpt_lnum = lnum; | ||
| 716 | c->lpt_offs = len; | ||
| 717 | } | ||
| 718 | /* Set branches to the level below */ | ||
| 719 | for (j = 0; j < UBIFS_LPT_FANOUT; j++) { | ||
| 720 | if (bcnt) { | ||
| 721 | if (boffs + bsz > c->leb_size) { | ||
| 722 | blnum += 1; | ||
| 723 | boffs = 0; | ||
| 724 | } | ||
| 725 | nnode->nbranch[j].lnum = blnum; | ||
| 726 | nnode->nbranch[j].offs = boffs; | ||
| 727 | boffs += bsz; | ||
| 728 | bcnt--; | ||
| 729 | } else { | ||
| 730 | nnode->nbranch[j].lnum = 0; | ||
| 731 | nnode->nbranch[j].offs = 0; | ||
| 732 | } | ||
| 733 | } | ||
| 734 | nnode->num = calc_nnode_num(row, i); | ||
| 735 | ubifs_pack_nnode(c, p, nnode); | ||
| 736 | p += c->nnode_sz; | ||
| 737 | len += c->nnode_sz; | ||
| 738 | } | ||
| 739 | /* Only 1 nnode at this level, so it is the root */ | ||
| 740 | if (cnt == 1) | ||
| 741 | break; | ||
| 742 | /* Update the information about the level below */ | ||
| 743 | bcnt = cnt; | ||
| 744 | bsz = c->nnode_sz; | ||
| 745 | row -= 1; | ||
| 746 | } | ||
| 747 | |||
| 748 | if (*big_lpt) { | ||
| 749 | /* Need to add LPT's save table */ | ||
| 750 | if (len + c->lsave_sz > c->leb_size) { | ||
| 751 | alen = ALIGN(len, c->min_io_size); | ||
| 752 | set_ltab(c, lnum, c->leb_size - alen, alen - len); | ||
| 753 | memset(p, 0xff, alen - len); | ||
| 754 | err = ubi_leb_change(c->ubi, lnum++, buf, alen, | ||
| 755 | UBI_SHORTTERM); | ||
| 756 | if (err) | ||
| 757 | goto out; | ||
| 758 | p = buf; | ||
| 759 | len = 0; | ||
| 760 | } | ||
| 761 | |||
| 762 | c->lsave_lnum = lnum; | ||
| 763 | c->lsave_offs = len; | ||
| 764 | |||
| 765 | for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++) | ||
| 766 | lsave[i] = c->main_first + i; | ||
| 767 | for (; i < c->lsave_cnt; i++) | ||
| 768 | lsave[i] = c->main_first; | ||
| 769 | |||
| 770 | ubifs_pack_lsave(c, p, lsave); | ||
| 771 | p += c->lsave_sz; | ||
| 772 | len += c->lsave_sz; | ||
| 773 | } | ||
| 774 | |||
| 775 | /* Need to add LPT's own LEB properties table */ | ||
| 776 | if (len + c->ltab_sz > c->leb_size) { | ||
| 777 | alen = ALIGN(len, c->min_io_size); | ||
| 778 | set_ltab(c, lnum, c->leb_size - alen, alen - len); | ||
| 779 | memset(p, 0xff, alen - len); | ||
| 780 | err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); | ||
| 781 | if (err) | ||
| 782 | goto out; | ||
| 783 | p = buf; | ||
| 784 | len = 0; | ||
| 785 | } | ||
| 786 | |||
| 787 | c->ltab_lnum = lnum; | ||
| 788 | c->ltab_offs = len; | ||
| 789 | |||
| 790 | /* Update ltab before packing it */ | ||
| 791 | len += c->ltab_sz; | ||
| 792 | alen = ALIGN(len, c->min_io_size); | ||
| 793 | set_ltab(c, lnum, c->leb_size - alen, alen - len); | ||
| 794 | |||
| 795 | ubifs_pack_ltab(c, p, ltab); | ||
| 796 | p += c->ltab_sz; | ||
| 797 | |||
| 798 | /* Write remaining buffer */ | ||
| 799 | memset(p, 0xff, alen - len); | ||
| 800 | err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); | ||
| 801 | if (err) | ||
| 802 | goto out; | ||
| 803 | |||
| 804 | c->nhead_lnum = lnum; | ||
| 805 | c->nhead_offs = ALIGN(len, c->min_io_size); | ||
| 806 | |||
| 807 | dbg_lp("space_bits %d", c->space_bits); | ||
| 808 | dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); | ||
| 809 | dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); | ||
| 810 | dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); | ||
| 811 | dbg_lp("pcnt_bits %d", c->pcnt_bits); | ||
| 812 | dbg_lp("lnum_bits %d", c->lnum_bits); | ||
| 813 | dbg_lp("pnode_sz %d", c->pnode_sz); | ||
| 814 | dbg_lp("nnode_sz %d", c->nnode_sz); | ||
| 815 | dbg_lp("ltab_sz %d", c->ltab_sz); | ||
| 816 | dbg_lp("lsave_sz %d", c->lsave_sz); | ||
| 817 | dbg_lp("lsave_cnt %d", c->lsave_cnt); | ||
| 818 | dbg_lp("lpt_hght %d", c->lpt_hght); | ||
| 819 | dbg_lp("big_lpt %d", c->big_lpt); | ||
| 820 | dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); | ||
| 821 | dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); | ||
| 822 | dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); | ||
| 823 | if (c->big_lpt) | ||
| 824 | dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); | ||
| 825 | out: | ||
| 826 | c->ltab = NULL; | ||
| 827 | kfree(lsave); | ||
| 828 | vfree(ltab); | ||
| 829 | vfree(buf); | ||
| 830 | kfree(nnode); | ||
| 831 | kfree(pnode); | ||
| 832 | return err; | ||
| 833 | } | ||
| 834 | |||
| 835 | /** | ||
| 836 | * update_cats - add LEB properties of a pnode to LEB category lists and heaps. | ||
| 837 | * @c: UBIFS file-system description object | ||
| 838 | * @pnode: pnode | ||
| 839 | * | ||
| 840 | * When a pnode is loaded into memory, the LEB properties it contains are added, | ||
| 841 | * by this function, to the LEB category lists and heaps. | ||
| 842 | */ | ||
| 843 | static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode) | ||
| 844 | { | ||
| 845 | int i; | ||
| 846 | |||
| 847 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 848 | int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK; | ||
| 849 | int lnum = pnode->lprops[i].lnum; | ||
| 850 | |||
| 851 | if (!lnum) | ||
| 852 | return; | ||
| 853 | ubifs_add_to_cat(c, &pnode->lprops[i], cat); | ||
| 854 | } | ||
| 855 | } | ||
| 856 | |||
| 857 | /** | ||
| 858 | * replace_cats - add LEB properties of a pnode to LEB category lists and heaps. | ||
| 859 | * @c: UBIFS file-system description object | ||
| 860 | * @old_pnode: pnode copied | ||
| 861 | * @new_pnode: pnode copy | ||
| 862 | * | ||
| 863 | * During commit it is sometimes necessary to copy a pnode | ||
| 864 | * (see dirty_cow_pnode). When that happens, references in | ||
| 865 | * category lists and heaps must be replaced. This function does that. | ||
| 866 | */ | ||
| 867 | static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, | ||
| 868 | struct ubifs_pnode *new_pnode) | ||
| 869 | { | ||
| 870 | int i; | ||
| 871 | |||
| 872 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 873 | if (!new_pnode->lprops[i].lnum) | ||
| 874 | return; | ||
| 875 | ubifs_replace_cat(c, &old_pnode->lprops[i], | ||
| 876 | &new_pnode->lprops[i]); | ||
| 877 | } | ||
| 878 | } | ||
| 879 | |||
| 880 | /** | ||
| 881 | * check_lpt_crc - check LPT node crc is correct. | ||
| 882 | * @c: UBIFS file-system description object | ||
| 883 | * @buf: buffer containing node | ||
| 884 | * @len: length of node | ||
| 885 | * | ||
| 886 | * This function returns %0 on success and a negative error code on failure. | ||
| 887 | */ | ||
| 888 | static int check_lpt_crc(void *buf, int len) | ||
| 889 | { | ||
| 890 | int pos = 0; | ||
| 891 | uint8_t *addr = buf; | ||
| 892 | uint16_t crc, calc_crc; | ||
| 893 | |||
| 894 | crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); | ||
| 895 | calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, | ||
| 896 | len - UBIFS_LPT_CRC_BYTES); | ||
| 897 | if (crc != calc_crc) { | ||
| 898 | ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, | ||
| 899 | calc_crc); | ||
| 900 | dbg_dump_stack(); | ||
| 901 | return -EINVAL; | ||
| 902 | } | ||
| 903 | return 0; | ||
| 904 | } | ||
| 905 | |||
| 906 | /** | ||
| 907 | * check_lpt_type - check LPT node type is correct. | ||
| 908 | * @c: UBIFS file-system description object | ||
| 909 | * @addr: address of type bit field is passed and returned updated here | ||
| 910 | * @pos: position of type bit field is passed and returned updated here | ||
| 911 | * @type: expected type | ||
| 912 | * | ||
| 913 | * This function returns %0 on success and a negative error code on failure. | ||
| 914 | */ | ||
| 915 | static int check_lpt_type(uint8_t **addr, int *pos, int type) | ||
| 916 | { | ||
| 917 | int node_type; | ||
| 918 | |||
| 919 | node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); | ||
| 920 | if (node_type != type) { | ||
| 921 | ubifs_err("invalid type (%d) in LPT node type %d", node_type, | ||
| 922 | type); | ||
| 923 | dbg_dump_stack(); | ||
| 924 | return -EINVAL; | ||
| 925 | } | ||
| 926 | return 0; | ||
| 927 | } | ||
| 928 | |||
| 929 | /** | ||
| 930 | * unpack_pnode - unpack a pnode. | ||
| 931 | * @c: UBIFS file-system description object | ||
| 932 | * @buf: buffer containing packed pnode to unpack | ||
| 933 | * @pnode: pnode structure to fill | ||
| 934 | * | ||
| 935 | * This function returns %0 on success and a negative error code on failure. | ||
| 936 | */ | ||
| 937 | static int unpack_pnode(struct ubifs_info *c, void *buf, | ||
| 938 | struct ubifs_pnode *pnode) | ||
| 939 | { | ||
| 940 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 941 | int i, pos = 0, err; | ||
| 942 | |||
| 943 | err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE); | ||
| 944 | if (err) | ||
| 945 | return err; | ||
| 946 | if (c->big_lpt) | ||
| 947 | pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); | ||
| 948 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 949 | struct ubifs_lprops * const lprops = &pnode->lprops[i]; | ||
| 950 | |||
| 951 | lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits); | ||
| 952 | lprops->free <<= 3; | ||
| 953 | lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits); | ||
| 954 | lprops->dirty <<= 3; | ||
| 955 | |||
| 956 | if (ubifs_unpack_bits(&addr, &pos, 1)) | ||
| 957 | lprops->flags = LPROPS_INDEX; | ||
| 958 | else | ||
| 959 | lprops->flags = 0; | ||
| 960 | lprops->flags |= ubifs_categorize_lprops(c, lprops); | ||
| 961 | } | ||
| 962 | err = check_lpt_crc(buf, c->pnode_sz); | ||
| 963 | return err; | ||
| 964 | } | ||
| 965 | |||
| 966 | /** | ||
| 967 | * unpack_nnode - unpack a nnode. | ||
| 968 | * @c: UBIFS file-system description object | ||
| 969 | * @buf: buffer containing packed nnode to unpack | ||
| 970 | * @nnode: nnode structure to fill | ||
| 971 | * | ||
| 972 | * This function returns %0 on success and a negative error code on failure. | ||
| 973 | */ | ||
| 974 | static int unpack_nnode(struct ubifs_info *c, void *buf, | ||
| 975 | struct ubifs_nnode *nnode) | ||
| 976 | { | ||
| 977 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 978 | int i, pos = 0, err; | ||
| 979 | |||
| 980 | err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE); | ||
| 981 | if (err) | ||
| 982 | return err; | ||
| 983 | if (c->big_lpt) | ||
| 984 | nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); | ||
| 985 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 986 | int lnum; | ||
| 987 | |||
| 988 | lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) + | ||
| 989 | c->lpt_first; | ||
| 990 | if (lnum == c->lpt_last + 1) | ||
| 991 | lnum = 0; | ||
| 992 | nnode->nbranch[i].lnum = lnum; | ||
| 993 | nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, | ||
| 994 | c->lpt_offs_bits); | ||
| 995 | } | ||
| 996 | err = check_lpt_crc(buf, c->nnode_sz); | ||
| 997 | return err; | ||
| 998 | } | ||
| 999 | |||
| 1000 | /** | ||
| 1001 | * unpack_ltab - unpack the LPT's own lprops table. | ||
| 1002 | * @c: UBIFS file-system description object | ||
| 1003 | * @buf: buffer from which to unpack | ||
| 1004 | * | ||
| 1005 | * This function returns %0 on success and a negative error code on failure. | ||
| 1006 | */ | ||
| 1007 | static int unpack_ltab(struct ubifs_info *c, void *buf) | ||
| 1008 | { | ||
| 1009 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 1010 | int i, pos = 0, err; | ||
| 1011 | |||
| 1012 | err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB); | ||
| 1013 | if (err) | ||
| 1014 | return err; | ||
| 1015 | for (i = 0; i < c->lpt_lebs; i++) { | ||
| 1016 | int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); | ||
| 1017 | int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); | ||
| 1018 | |||
| 1019 | if (free < 0 || free > c->leb_size || dirty < 0 || | ||
| 1020 | dirty > c->leb_size || free + dirty > c->leb_size) | ||
| 1021 | return -EINVAL; | ||
| 1022 | |||
| 1023 | c->ltab[i].free = free; | ||
| 1024 | c->ltab[i].dirty = dirty; | ||
| 1025 | c->ltab[i].tgc = 0; | ||
| 1026 | c->ltab[i].cmt = 0; | ||
| 1027 | } | ||
| 1028 | err = check_lpt_crc(buf, c->ltab_sz); | ||
| 1029 | return err; | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | /** | ||
| 1033 | * unpack_lsave - unpack the LPT's save table. | ||
| 1034 | * @c: UBIFS file-system description object | ||
| 1035 | * @buf: buffer from which to unpack | ||
| 1036 | * | ||
| 1037 | * This function returns %0 on success and a negative error code on failure. | ||
| 1038 | */ | ||
| 1039 | static int unpack_lsave(struct ubifs_info *c, void *buf) | ||
| 1040 | { | ||
| 1041 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 1042 | int i, pos = 0, err; | ||
| 1043 | |||
| 1044 | err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE); | ||
| 1045 | if (err) | ||
| 1046 | return err; | ||
| 1047 | for (i = 0; i < c->lsave_cnt; i++) { | ||
| 1048 | int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits); | ||
| 1049 | |||
| 1050 | if (lnum < c->main_first || lnum >= c->leb_cnt) | ||
| 1051 | return -EINVAL; | ||
| 1052 | c->lsave[i] = lnum; | ||
| 1053 | } | ||
| 1054 | err = check_lpt_crc(buf, c->lsave_sz); | ||
| 1055 | return err; | ||
| 1056 | } | ||
| 1057 | |||
| 1058 | /** | ||
| 1059 | * validate_nnode - validate a nnode. | ||
| 1060 | * @c: UBIFS file-system description object | ||
| 1061 | * @nnode: nnode to validate | ||
| 1062 | * @parent: parent nnode (or NULL for the root nnode) | ||
| 1063 | * @iip: index in parent | ||
| 1064 | * | ||
| 1065 | * This function returns %0 on success and a negative error code on failure. | ||
| 1066 | */ | ||
| 1067 | static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, | ||
| 1068 | struct ubifs_nnode *parent, int iip) | ||
| 1069 | { | ||
| 1070 | int i, lvl, max_offs; | ||
| 1071 | |||
| 1072 | if (c->big_lpt) { | ||
| 1073 | int num = calc_nnode_num_from_parent(c, parent, iip); | ||
| 1074 | |||
| 1075 | if (nnode->num != num) | ||
| 1076 | return -EINVAL; | ||
| 1077 | } | ||
| 1078 | lvl = parent ? parent->level - 1 : c->lpt_hght; | ||
| 1079 | if (lvl < 1) | ||
| 1080 | return -EINVAL; | ||
| 1081 | if (lvl == 1) | ||
| 1082 | max_offs = c->leb_size - c->pnode_sz; | ||
| 1083 | else | ||
| 1084 | max_offs = c->leb_size - c->nnode_sz; | ||
| 1085 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 1086 | int lnum = nnode->nbranch[i].lnum; | ||
| 1087 | int offs = nnode->nbranch[i].offs; | ||
| 1088 | |||
| 1089 | if (lnum == 0) { | ||
| 1090 | if (offs != 0) | ||
| 1091 | return -EINVAL; | ||
| 1092 | continue; | ||
| 1093 | } | ||
| 1094 | if (lnum < c->lpt_first || lnum > c->lpt_last) | ||
| 1095 | return -EINVAL; | ||
| 1096 | if (offs < 0 || offs > max_offs) | ||
| 1097 | return -EINVAL; | ||
| 1098 | } | ||
| 1099 | return 0; | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | /** | ||
| 1103 | * validate_pnode - validate a pnode. | ||
| 1104 | * @c: UBIFS file-system description object | ||
| 1105 | * @pnode: pnode to validate | ||
| 1106 | * @parent: parent nnode | ||
| 1107 | * @iip: index in parent | ||
| 1108 | * | ||
| 1109 | * This function returns %0 on success and a negative error code on failure. | ||
| 1110 | */ | ||
| 1111 | static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, | ||
| 1112 | struct ubifs_nnode *parent, int iip) | ||
| 1113 | { | ||
| 1114 | int i; | ||
| 1115 | |||
| 1116 | if (c->big_lpt) { | ||
| 1117 | int num = calc_pnode_num_from_parent(c, parent, iip); | ||
| 1118 | |||
| 1119 | if (pnode->num != num) | ||
| 1120 | return -EINVAL; | ||
| 1121 | } | ||
| 1122 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 1123 | int free = pnode->lprops[i].free; | ||
| 1124 | int dirty = pnode->lprops[i].dirty; | ||
| 1125 | |||
| 1126 | if (free < 0 || free > c->leb_size || free % c->min_io_size || | ||
| 1127 | (free & 7)) | ||
| 1128 | return -EINVAL; | ||
| 1129 | if (dirty < 0 || dirty > c->leb_size || (dirty & 7)) | ||
| 1130 | return -EINVAL; | ||
| 1131 | if (dirty + free > c->leb_size) | ||
| 1132 | return -EINVAL; | ||
| 1133 | } | ||
| 1134 | return 0; | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | /** | ||
| 1138 | * set_pnode_lnum - set LEB numbers on a pnode. | ||
| 1139 | * @c: UBIFS file-system description object | ||
| 1140 | * @pnode: pnode to update | ||
| 1141 | * | ||
| 1142 | * This function calculates the LEB numbers for the LEB properties it contains | ||
| 1143 | * based on the pnode number. | ||
| 1144 | */ | ||
| 1145 | static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) | ||
| 1146 | { | ||
| 1147 | int i, lnum; | ||
| 1148 | |||
| 1149 | lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first; | ||
| 1150 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 1151 | if (lnum >= c->leb_cnt) | ||
| 1152 | return; | ||
| 1153 | pnode->lprops[i].lnum = lnum++; | ||
| 1154 | } | ||
| 1155 | } | ||
| 1156 | |||
| 1157 | /** | ||
| 1158 | * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory. | ||
| 1159 | * @c: UBIFS file-system description object | ||
| 1160 | * @parent: parent nnode (or NULL for the root) | ||
| 1161 | * @iip: index in parent | ||
| 1162 | * | ||
| 1163 | * This function returns %0 on success and a negative error code on failure. | ||
| 1164 | */ | ||
| 1165 | int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) | ||
| 1166 | { | ||
| 1167 | struct ubifs_nbranch *branch = NULL; | ||
| 1168 | struct ubifs_nnode *nnode = NULL; | ||
| 1169 | void *buf = c->lpt_nod_buf; | ||
| 1170 | int err, lnum, offs; | ||
| 1171 | |||
| 1172 | if (parent) { | ||
| 1173 | branch = &parent->nbranch[iip]; | ||
| 1174 | lnum = branch->lnum; | ||
| 1175 | offs = branch->offs; | ||
| 1176 | } else { | ||
| 1177 | lnum = c->lpt_lnum; | ||
| 1178 | offs = c->lpt_offs; | ||
| 1179 | } | ||
| 1180 | nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS); | ||
| 1181 | if (!nnode) { | ||
| 1182 | err = -ENOMEM; | ||
| 1183 | goto out; | ||
| 1184 | } | ||
| 1185 | if (lnum == 0) { | ||
| 1186 | /* | ||
| 1187 | * This nnode was not written which just means that the LEB | ||
| 1188 | * properties in the subtree below it describe empty LEBs. We | ||
| 1189 | * make the nnode as though we had read it, which in fact means | ||
| 1190 | * doing almost nothing. | ||
| 1191 | */ | ||
| 1192 | if (c->big_lpt) | ||
| 1193 | nnode->num = calc_nnode_num_from_parent(c, parent, iip); | ||
| 1194 | } else { | ||
| 1195 | err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); | ||
| 1196 | if (err) | ||
| 1197 | goto out; | ||
| 1198 | err = unpack_nnode(c, buf, nnode); | ||
| 1199 | if (err) | ||
| 1200 | goto out; | ||
| 1201 | } | ||
| 1202 | err = validate_nnode(c, nnode, parent, iip); | ||
| 1203 | if (err) | ||
| 1204 | goto out; | ||
| 1205 | if (!c->big_lpt) | ||
| 1206 | nnode->num = calc_nnode_num_from_parent(c, parent, iip); | ||
| 1207 | if (parent) { | ||
| 1208 | branch->nnode = nnode; | ||
| 1209 | nnode->level = parent->level - 1; | ||
| 1210 | } else { | ||
| 1211 | c->nroot = nnode; | ||
| 1212 | nnode->level = c->lpt_hght; | ||
| 1213 | } | ||
| 1214 | nnode->parent = parent; | ||
| 1215 | nnode->iip = iip; | ||
| 1216 | return 0; | ||
| 1217 | |||
| 1218 | out: | ||
| 1219 | ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); | ||
| 1220 | kfree(nnode); | ||
| 1221 | return err; | ||
| 1222 | } | ||
| 1223 | |||
/**
 * read_pnode - read a pnode from flash and link it to the tree in memory.
 * @c: UBIFS file-system description object
 * @parent: parent nnode
 * @iip: index in parent
 *
 * The pnode location is taken from @parent's branch @iip. An lnum of zero
 * means the pnode was never written, in which case it is synthesized as a
 * pnode full of empty-LEB properties. On success the new pnode is linked
 * into the in-memory LPT under @parent.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
{
	struct ubifs_nbranch *branch;
	struct ubifs_pnode *pnode = NULL;
	void *buf = c->lpt_nod_buf;
	int err, lnum, offs;

	branch = &parent->nbranch[iip];
	lnum = branch->lnum;
	offs = branch->offs;
	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
	if (!pnode) {
		err = -ENOMEM;
		goto out;
	}
	if (lnum == 0) {
		/*
		 * This pnode was not written which just means that the LEB
		 * properties in it describe empty LEBs. We make the pnode as
		 * though we had read it.
		 */
		int i;

		if (c->big_lpt)
			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
			struct ubifs_lprops * const lprops = &pnode->lprops[i];

			/* Whole LEB free, then categorize accordingly */
			lprops->free = c->leb_size;
			lprops->flags = ubifs_categorize_lprops(c, lprops);
		}
	} else {
		/* Read the packed pnode into the shared node buffer */
		err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
		if (err)
			goto out;
		err = unpack_pnode(c, buf, pnode);
		if (err)
			goto out;
	}
	err = validate_pnode(c, pnode, parent, iip);
	if (err)
		goto out;
	/* In the small-LPT model, node numbers are not stored on flash */
	if (!c->big_lpt)
		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
	/* Link the new pnode into the in-memory tree */
	branch->pnode = pnode;
	pnode->parent = parent;
	pnode->iip = iip;
	set_pnode_lnum(c, pnode);
	c->pnodes_have += 1;	/* one more pnode cached in memory */
	return 0;

out:
	ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
	dbg_dump_pnode(c, pnode, parent, iip);
	dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
	kfree(pnode);
	return err;
}
| 1290 | |||
| 1291 | /** | ||
| 1292 | * read_ltab - read LPT's own lprops table. | ||
| 1293 | * @c: UBIFS file-system description object | ||
| 1294 | * | ||
| 1295 | * This function returns %0 on success and a negative error code on failure. | ||
| 1296 | */ | ||
| 1297 | static int read_ltab(struct ubifs_info *c) | ||
| 1298 | { | ||
| 1299 | int err; | ||
| 1300 | void *buf; | ||
| 1301 | |||
| 1302 | buf = vmalloc(c->ltab_sz); | ||
| 1303 | if (!buf) | ||
| 1304 | return -ENOMEM; | ||
| 1305 | err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); | ||
| 1306 | if (err) | ||
| 1307 | goto out; | ||
| 1308 | err = unpack_ltab(c, buf); | ||
| 1309 | out: | ||
| 1310 | vfree(buf); | ||
| 1311 | return err; | ||
| 1312 | } | ||
| 1313 | |||
/**
 * read_lsave - read LPT's save table.
 * @c: UBIFS file-system description object
 *
 * Reads the lsave table from flash into @c->lsave and then looks up every
 * LEB it names, which pre-loads the corresponding pnodes into the in-memory
 * LPT cache.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int read_lsave(struct ubifs_info *c)
{
	int err, i;
	void *buf;

	buf = vmalloc(c->lsave_sz);
	if (!buf)
		return -ENOMEM;
	err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
	if (err)
		goto out;
	err = unpack_lsave(c, buf);
	if (err)
		goto out;
	for (i = 0; i < c->lsave_cnt; i++) {
		int lnum = c->lsave[i];

		/*
		 * Due to automatic resizing, the values in the lsave table
		 * could be beyond the volume size - just ignore them.
		 */
		if (lnum >= c->leb_cnt)
			continue;
		/*
		 * NOTE(review): the lookup result is intentionally discarded -
		 * the call is made only for its cache-warming side effect, and
		 * any read error would presumably surface on a later lookup.
		 */
		ubifs_lpt_lookup(c, lnum);
	}
out:
	vfree(buf);
	return err;
}
| 1349 | |||
| 1350 | /** | ||
| 1351 | * ubifs_get_nnode - get a nnode. | ||
| 1352 | * @c: UBIFS file-system description object | ||
| 1353 | * @parent: parent nnode (or NULL for the root) | ||
| 1354 | * @iip: index in parent | ||
| 1355 | * | ||
| 1356 | * This function returns a pointer to the nnode on success or a negative error | ||
| 1357 | * code on failure. | ||
| 1358 | */ | ||
| 1359 | struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, | ||
| 1360 | struct ubifs_nnode *parent, int iip) | ||
| 1361 | { | ||
| 1362 | struct ubifs_nbranch *branch; | ||
| 1363 | struct ubifs_nnode *nnode; | ||
| 1364 | int err; | ||
| 1365 | |||
| 1366 | branch = &parent->nbranch[iip]; | ||
| 1367 | nnode = branch->nnode; | ||
| 1368 | if (nnode) | ||
| 1369 | return nnode; | ||
| 1370 | err = ubifs_read_nnode(c, parent, iip); | ||
| 1371 | if (err) | ||
| 1372 | return ERR_PTR(err); | ||
| 1373 | return branch->nnode; | ||
| 1374 | } | ||
| 1375 | |||
| 1376 | /** | ||
| 1377 | * ubifs_get_pnode - get a pnode. | ||
| 1378 | * @c: UBIFS file-system description object | ||
| 1379 | * @parent: parent nnode | ||
| 1380 | * @iip: index in parent | ||
| 1381 | * | ||
| 1382 | * This function returns a pointer to the pnode on success or a negative error | ||
| 1383 | * code on failure. | ||
| 1384 | */ | ||
| 1385 | struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, | ||
| 1386 | struct ubifs_nnode *parent, int iip) | ||
| 1387 | { | ||
| 1388 | struct ubifs_nbranch *branch; | ||
| 1389 | struct ubifs_pnode *pnode; | ||
| 1390 | int err; | ||
| 1391 | |||
| 1392 | branch = &parent->nbranch[iip]; | ||
| 1393 | pnode = branch->pnode; | ||
| 1394 | if (pnode) | ||
| 1395 | return pnode; | ||
| 1396 | err = read_pnode(c, parent, iip); | ||
| 1397 | if (err) | ||
| 1398 | return ERR_PTR(err); | ||
| 1399 | update_cats(c, branch->pnode); | ||
| 1400 | return branch->pnode; | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | /** | ||
| 1404 | * ubifs_lpt_lookup - lookup LEB properties in the LPT. | ||
| 1405 | * @c: UBIFS file-system description object | ||
| 1406 | * @lnum: LEB number to lookup | ||
| 1407 | * | ||
| 1408 | * This function returns a pointer to the LEB properties on success or a | ||
| 1409 | * negative error code on failure. | ||
| 1410 | */ | ||
| 1411 | struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) | ||
| 1412 | { | ||
| 1413 | int err, i, h, iip, shft; | ||
| 1414 | struct ubifs_nnode *nnode; | ||
| 1415 | struct ubifs_pnode *pnode; | ||
| 1416 | |||
| 1417 | if (!c->nroot) { | ||
| 1418 | err = ubifs_read_nnode(c, NULL, 0); | ||
| 1419 | if (err) | ||
| 1420 | return ERR_PTR(err); | ||
| 1421 | } | ||
| 1422 | nnode = c->nroot; | ||
| 1423 | i = lnum - c->main_first; | ||
| 1424 | shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; | ||
| 1425 | for (h = 1; h < c->lpt_hght; h++) { | ||
| 1426 | iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); | ||
| 1427 | shft -= UBIFS_LPT_FANOUT_SHIFT; | ||
| 1428 | nnode = ubifs_get_nnode(c, nnode, iip); | ||
| 1429 | if (IS_ERR(nnode)) | ||
| 1430 | return ERR_PTR(PTR_ERR(nnode)); | ||
| 1431 | } | ||
| 1432 | iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); | ||
| 1433 | shft -= UBIFS_LPT_FANOUT_SHIFT; | ||
| 1434 | pnode = ubifs_get_pnode(c, nnode, iip); | ||
| 1435 | if (IS_ERR(pnode)) | ||
| 1436 | return ERR_PTR(PTR_ERR(pnode)); | ||
| 1437 | iip = (i & (UBIFS_LPT_FANOUT - 1)); | ||
| 1438 | dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, | ||
| 1439 | pnode->lprops[iip].free, pnode->lprops[iip].dirty, | ||
| 1440 | pnode->lprops[iip].flags); | ||
| 1441 | return &pnode->lprops[iip]; | ||
| 1442 | } | ||
| 1443 | |||
/**
 * dirty_cow_nnode - ensure a nnode is not being committed.
 * @c: UBIFS file-system description object
 * @nnode: nnode to check
 *
 * If @nnode is not part of an in-progress commit it is simply marked dirty
 * (and its dirty space accounted). If it is being committed, a copy is made
 * so the committed version stays immutable; the copy replaces @nnode in the
 * in-memory tree and the original is marked obsolete.
 *
 * Returns dirtied nnode on success or negative error code on failure.
 */
static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
					   struct ubifs_nnode *nnode)
{
	struct ubifs_nnode *n;
	int i;

	if (!test_bit(COW_CNODE, &nnode->flags)) {
		/* nnode is not being committed */
		if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
			/* First time it becomes dirty - account for it */
			c->dirty_nn_cnt += 1;
			ubifs_add_nnode_dirt(c, nnode);
		}
		return nnode;
	}

	/* nnode is being committed, so copy it */
	n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
	if (unlikely(!n))
		return ERR_PTR(-ENOMEM);

	memcpy(n, nnode, sizeof(struct ubifs_nnode));
	n->cnext = NULL;
	/* The copy is dirty and is not part of the commit */
	__set_bit(DIRTY_CNODE, &n->flags);
	__clear_bit(COW_CNODE, &n->flags);

	/* The children now have new parent */
	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
		struct ubifs_nbranch *branch = &n->nbranch[i];

		if (branch->cnode)
			branch->cnode->parent = n;
	}

	/* The original will be freed once the commit finishes with it */
	ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
	__set_bit(OBSOLETE_CNODE, &nnode->flags);

	c->dirty_nn_cnt += 1;
	ubifs_add_nnode_dirt(c, nnode);
	/* Splice the copy into the tree in place of the original */
	if (nnode->parent)
		nnode->parent->nbranch[n->iip].nnode = n;
	else
		c->nroot = n;
	return n;
}
| 1495 | |||
/**
 * dirty_cow_pnode - ensure a pnode is not being committed.
 * @c: UBIFS file-system description object
 * @pnode: pnode to check
 *
 * If @pnode is not part of an in-progress commit it is simply marked dirty.
 * Otherwise a copy is made, spliced into the in-memory tree in place of the
 * original, and the original is marked obsolete.
 *
 * Returns dirtied pnode on success or negative error code on failure.
 */
static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
					   struct ubifs_pnode *pnode)
{
	struct ubifs_pnode *p;

	if (!test_bit(COW_CNODE, &pnode->flags)) {
		/* pnode is not being committed */
		if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
			/* First time it becomes dirty - account for it */
			c->dirty_pn_cnt += 1;
			add_pnode_dirt(c, pnode);
		}
		return pnode;
	}

	/* pnode is being committed, so copy it */
	p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
	if (unlikely(!p))
		return ERR_PTR(-ENOMEM);

	memcpy(p, pnode, sizeof(struct ubifs_pnode));
	p->cnext = NULL;
	__set_bit(DIRTY_CNODE, &p->flags);
	__clear_bit(COW_CNODE, &p->flags);
	/* Category lists/heaps must now point at the copy's lprops */
	replace_cats(c, pnode, p);

	ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
	__set_bit(OBSOLETE_CNODE, &pnode->flags);

	c->dirty_pn_cnt += 1;
	add_pnode_dirt(c, pnode);
	/*
	 * Unlike the nnode case there is no root check here - the LPT root
	 * is an nnode, so a pnode evidently always has a parent.
	 */
	pnode->parent->nbranch[p->iip].pnode = p;
	return p;
}
| 1536 | |||
| 1537 | /** | ||
| 1538 | * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT. | ||
| 1539 | * @c: UBIFS file-system description object | ||
| 1540 | * @lnum: LEB number to lookup | ||
| 1541 | * | ||
| 1542 | * This function returns a pointer to the LEB properties on success or a | ||
| 1543 | * negative error code on failure. | ||
| 1544 | */ | ||
| 1545 | struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) | ||
| 1546 | { | ||
| 1547 | int err, i, h, iip, shft; | ||
| 1548 | struct ubifs_nnode *nnode; | ||
| 1549 | struct ubifs_pnode *pnode; | ||
| 1550 | |||
| 1551 | if (!c->nroot) { | ||
| 1552 | err = ubifs_read_nnode(c, NULL, 0); | ||
| 1553 | if (err) | ||
| 1554 | return ERR_PTR(err); | ||
| 1555 | } | ||
| 1556 | nnode = c->nroot; | ||
| 1557 | nnode = dirty_cow_nnode(c, nnode); | ||
| 1558 | if (IS_ERR(nnode)) | ||
| 1559 | return ERR_PTR(PTR_ERR(nnode)); | ||
| 1560 | i = lnum - c->main_first; | ||
| 1561 | shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; | ||
| 1562 | for (h = 1; h < c->lpt_hght; h++) { | ||
| 1563 | iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); | ||
| 1564 | shft -= UBIFS_LPT_FANOUT_SHIFT; | ||
| 1565 | nnode = ubifs_get_nnode(c, nnode, iip); | ||
| 1566 | if (IS_ERR(nnode)) | ||
| 1567 | return ERR_PTR(PTR_ERR(nnode)); | ||
| 1568 | nnode = dirty_cow_nnode(c, nnode); | ||
| 1569 | if (IS_ERR(nnode)) | ||
| 1570 | return ERR_PTR(PTR_ERR(nnode)); | ||
| 1571 | } | ||
| 1572 | iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); | ||
| 1573 | shft -= UBIFS_LPT_FANOUT_SHIFT; | ||
| 1574 | pnode = ubifs_get_pnode(c, nnode, iip); | ||
| 1575 | if (IS_ERR(pnode)) | ||
| 1576 | return ERR_PTR(PTR_ERR(pnode)); | ||
| 1577 | pnode = dirty_cow_pnode(c, pnode); | ||
| 1578 | if (IS_ERR(pnode)) | ||
| 1579 | return ERR_PTR(PTR_ERR(pnode)); | ||
| 1580 | iip = (i & (UBIFS_LPT_FANOUT - 1)); | ||
| 1581 | dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, | ||
| 1582 | pnode->lprops[iip].free, pnode->lprops[iip].dirty, | ||
| 1583 | pnode->lprops[iip].flags); | ||
| 1584 | ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags)); | ||
| 1585 | return &pnode->lprops[iip]; | ||
| 1586 | } | ||
| 1587 | |||
/**
 * lpt_init_rd - initialize the LPT for reading.
 * @c: UBIFS file-system description object
 *
 * Allocates the in-memory ltab, the node buffer, and the category heaps,
 * then reads the ltab from flash.
 *
 * NOTE(review): on allocation failure, earlier allocations are not freed
 * here - presumably the caller's error path frees them; confirm.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int lpt_init_rd(struct ubifs_info *c)
{
	int err, i;

	c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
	if (!c->ltab)
		return -ENOMEM;

	/* One buffer big enough for either node type, reused for all reads */
	i = max_t(int, c->nnode_sz, c->pnode_sz);
	c->lpt_nod_buf = kmalloc(i, GFP_KERNEL);
	if (!c->lpt_nod_buf)
		return -ENOMEM;

	for (i = 0; i < LPROPS_HEAP_CNT; i++) {
		c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ,
					     GFP_KERNEL);
		if (!c->lpt_heap[i].arr)
			return -ENOMEM;
		c->lpt_heap[i].cnt = 0;
		c->lpt_heap[i].max_cnt = LPT_HEAP_SZ;
	}

	c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
	if (!c->dirty_idx.arr)
		return -ENOMEM;
	c->dirty_idx.cnt = 0;
	c->dirty_idx.max_cnt = LPT_HEAP_SZ;

	err = read_ltab(c);
	if (err)
		return err;

	/* Dump the LPT geometry for debugging */
	dbg_lp("space_bits %d", c->space_bits);
	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
	dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
	dbg_lp("pcnt_bits %d", c->pcnt_bits);
	dbg_lp("lnum_bits %d", c->lnum_bits);
	dbg_lp("pnode_sz %d", c->pnode_sz);
	dbg_lp("nnode_sz %d", c->nnode_sz);
	dbg_lp("ltab_sz %d", c->ltab_sz);
	dbg_lp("lsave_sz %d", c->lsave_sz);
	dbg_lp("lsave_cnt %d", c->lsave_cnt);
	dbg_lp("lpt_hght %d", c->lpt_hght);
	dbg_lp("big_lpt %d", c->big_lpt);
	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
	if (c->big_lpt)
		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);

	return 0;
}
| 1647 | |||
/**
 * lpt_init_wr - initialize the LPT for writing.
 * @c: UBIFS file-system description object
 *
 * 'lpt_init_rd()' must have been called already.
 *
 * Allocates the commit copy of the ltab and the LEB-sized write buffer,
 * reads the lsave table when the big-LPT model is in use, and unmaps all
 * completely free LPT LEBs so they can be reused.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int lpt_init_wr(struct ubifs_info *c)
{
	int err, i;

	/* Shadow copy of the ltab used during commit */
	c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
	if (!c->ltab_cmt)
		return -ENOMEM;

	c->lpt_buf = vmalloc(c->leb_size);
	if (!c->lpt_buf)
		return -ENOMEM;

	/* The lsave table only exists in the big-LPT model */
	if (c->big_lpt) {
		c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
		if (!c->lsave)
			return -ENOMEM;
		err = read_lsave(c);
		if (err)
			return err;
	}

	/* Unmap LPT LEBs that are entirely free so they may be rewritten */
	for (i = 0; i < c->lpt_lebs; i++)
		if (c->ltab[i].free == c->leb_size) {
			err = ubifs_leb_unmap(c, i + c->lpt_first);
			if (err)
				return err;
		}

	return 0;
}
| 1686 | |||
/**
 * ubifs_lpt_init - initialize the LPT.
 * @c: UBIFS file-system description object
 * @rd: whether to initialize lpt for reading
 * @wr: whether to initialize lpt for writing
 *
 * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
 * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
 * true.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
{
	int err = 0;

	if (rd)
		err = lpt_init_rd(c);
	/* Write-side setup builds on the read-side state */
	if (!err && wr)
		err = lpt_init_wr(c);

	return err;
}
| 1717 | |||
/**
 * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
 * @nnode: where to keep a nnode
 * @pnode: where to keep a pnode
 * @cnode: where to keep a cnode
 * @in_tree: is the node in the tree in memory
 * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
 * the tree
 * @ptr.pnode: ditto for pnode
 * @ptr.cnode: ditto for cnode
 *
 * One entry of the scan "path" - holds either the node storage itself (when
 * the node is not part of the in-memory tree) or, via @ptr, a reference to
 * the tree-resident node. The anonymous union lets one slot serve any node
 * type, since a path level holds exactly one kind at a time.
 */
struct lpt_scan_node {
	union {
		struct ubifs_nnode nnode;
		struct ubifs_pnode pnode;
		struct ubifs_cnode cnode;
	};
	int in_tree;
	union {
		struct ubifs_nnode *nnode;
		struct ubifs_pnode *pnode;
		struct ubifs_cnode *cnode;
	} ptr;
};
| 1742 | |||
/**
 * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
 * @c: the UBIFS file-system description object
 * @path: where to put the nnode
 * @parent: parent of the nnode
 * @iip: index in parent of the nnode
 *
 * Unlike 'ubifs_get_nnode()', a node read from flash is stored in @path
 * rather than being linked into the in-memory tree, and @path->in_tree
 * records which case occurred.
 *
 * This function returns a pointer to the nnode on success or a negative error
 * code on failure.
 */
static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
					  struct lpt_scan_node *path,
					  struct ubifs_nnode *parent, int iip)
{
	struct ubifs_nbranch *branch;
	struct ubifs_nnode *nnode;
	void *buf = c->lpt_nod_buf;
	int err;

	branch = &parent->nbranch[iip];
	nnode = branch->nnode;
	if (nnode) {
		/* Already cached in the tree - just reference it */
		path->in_tree = 1;
		path->ptr.nnode = nnode;
		return nnode;
	}
	/* Not in the tree - materialize it in the path slot instead */
	nnode = &path->nnode;
	path->in_tree = 0;
	path->ptr.nnode = nnode;
	memset(nnode, 0, sizeof(struct ubifs_nnode));
	if (branch->lnum == 0) {
		/*
		 * This nnode was not written which just means that the LEB
		 * properties in the subtree below it describe empty LEBs. We
		 * make the nnode as though we had read it, which in fact means
		 * doing almost nothing.
		 */
		if (c->big_lpt)
			nnode->num = calc_nnode_num_from_parent(c, parent, iip);
	} else {
		err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
			       c->nnode_sz);
		if (err)
			return ERR_PTR(err);
		err = unpack_nnode(c, buf, nnode);
		if (err)
			return ERR_PTR(err);
	}
	err = validate_nnode(c, nnode, parent, iip);
	if (err)
		return ERR_PTR(err);
	/* In the small-LPT model, node numbers are not stored on flash */
	if (!c->big_lpt)
		nnode->num = calc_nnode_num_from_parent(c, parent, iip);
	nnode->level = parent->level - 1;
	nnode->parent = parent;
	nnode->iip = iip;
	return nnode;
}
| 1801 | |||
/**
 * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
 * @c: the UBIFS file-system description object
 * @path: where to put the pnode
 * @parent: parent of the pnode
 * @iip: index in parent of the pnode
 *
 * Unlike 'ubifs_get_pnode()', a node read from flash is stored in @path
 * rather than being linked into the in-memory tree, and @path->in_tree
 * records which case occurred.
 *
 * This function returns a pointer to the pnode on success or a negative error
 * code on failure.
 */
static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
					  struct lpt_scan_node *path,
					  struct ubifs_nnode *parent, int iip)
{
	struct ubifs_nbranch *branch;
	struct ubifs_pnode *pnode;
	void *buf = c->lpt_nod_buf;
	int err;

	branch = &parent->nbranch[iip];
	pnode = branch->pnode;
	if (pnode) {
		/* Already cached in the tree - just reference it */
		path->in_tree = 1;
		path->ptr.pnode = pnode;
		return pnode;
	}
	/* Not in the tree - materialize it in the path slot instead */
	pnode = &path->pnode;
	path->in_tree = 0;
	path->ptr.pnode = pnode;
	memset(pnode, 0, sizeof(struct ubifs_pnode));
	if (branch->lnum == 0) {
		/*
		 * This pnode was not written which just means that the LEB
		 * properties in it describe empty LEBs. We make the pnode as
		 * though we had read it.
		 */
		int i;

		if (c->big_lpt)
			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
			struct ubifs_lprops * const lprops = &pnode->lprops[i];

			/* Whole LEB free, then categorize accordingly */
			lprops->free = c->leb_size;
			lprops->flags = ubifs_categorize_lprops(c, lprops);
		}
	} else {
		/* Sanity-check the location before reading from flash */
		ubifs_assert(branch->lnum >= c->lpt_first &&
			     branch->lnum <= c->lpt_last);
		ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
		err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
			       c->pnode_sz);
		if (err)
			return ERR_PTR(err);
		err = unpack_pnode(c, buf, pnode);
		if (err)
			return ERR_PTR(err);
	}
	err = validate_pnode(c, pnode, parent, iip);
	if (err)
		return ERR_PTR(err);
	/* In the small-LPT model, node numbers are not stored on flash */
	if (!c->big_lpt)
		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
	pnode->parent = parent;
	pnode->iip = iip;
	set_pnode_lnum(c, pnode);
	return pnode;
}
| 1870 | |||
/**
 * ubifs_lpt_scan_nolock - scan the LPT.
 * @c: the UBIFS file-system description object
 * @start_lnum: LEB number from which to start scanning
 * @end_lnum: LEB number at which to stop scanning
 * @scan_cb: callback function called for each lprops
 * @data: data to be passed to the callback function
 *
 * Iterates over the lprops of main-area LEBs from @start_lnum to @end_lnum
 * (wrapping around at the end of the main area), invoking @scan_cb for each.
 * Nodes visited along the way are held in a temporary "path" array and are
 * only linked into the in-memory tree if the callback requests it with
 * %LPT_SCAN_ADD; %LPT_SCAN_STOP ends the scan. If @start_lnum is -1 the scan
 * begins just after @end_lnum.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
			  ubifs_lpt_scan_callback scan_cb, void *data)
{
	int err = 0, i, h, iip, shft;
	struct ubifs_nnode *nnode;
	struct ubifs_pnode *pnode;
	struct lpt_scan_node *path;

	if (start_lnum == -1) {
		/* Start just after end_lnum, wrapping to the main area start */
		start_lnum = end_lnum + 1;
		if (start_lnum >= c->leb_cnt)
			start_lnum = c->main_first;
	}

	ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
	ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);

	if (!c->nroot) {
		err = ubifs_read_nnode(c, NULL, 0);
		if (err)
			return err;
	}

	/* One path slot per tree level, root at index 0 */
	path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
		       GFP_NOFS);
	if (!path)
		return -ENOMEM;

	path[0].ptr.nnode = c->nroot;
	path[0].in_tree = 1;
again:
	/* Descend to the pnode containing start_lnum */
	nnode = c->nroot;
	i = start_lnum - c->main_first;
	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
	for (h = 1; h < c->lpt_hght; h++) {
		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
		shft -= UBIFS_LPT_FANOUT_SHIFT;
		nnode = scan_get_nnode(c, path + h, nnode, iip);
		if (IS_ERR(nnode)) {
			err = PTR_ERR(nnode);
			goto out;
		}
	}
	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
	shft -= UBIFS_LPT_FANOUT_SHIFT;
	pnode = scan_get_pnode(c, path + h, nnode, iip);
	if (IS_ERR(pnode)) {
		err = PTR_ERR(pnode);
		goto out;
	}
	iip = (i & (UBIFS_LPT_FANOUT - 1));

	/* Loop for each lprops */
	while (1) {
		struct ubifs_lprops *lprops = &pnode->lprops[iip];
		int ret, lnum = lprops->lnum;

		ret = scan_cb(c, lprops, path[h].in_tree, data);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret & LPT_SCAN_ADD) {
			/* Add all the nodes in path to the tree in memory */
			for (h = 1; h < c->lpt_hght; h++) {
				const size_t sz = sizeof(struct ubifs_nnode);
				struct ubifs_nnode *parent;

				if (path[h].in_tree)
					continue;
				nnode = kmalloc(sz, GFP_NOFS);
				if (!nnode) {
					err = -ENOMEM;
					goto out;
				}
				/* Copy out of the path slot and link it in */
				memcpy(nnode, &path[h].nnode, sz);
				parent = nnode->parent;
				parent->nbranch[nnode->iip].nnode = nnode;
				path[h].ptr.nnode = nnode;
				path[h].in_tree = 1;
				/* The next level's parent pointer moved too */
				path[h + 1].cnode.parent = nnode;
			}
			if (path[h].in_tree)
				ubifs_ensure_cat(c, lprops);
			else {
				const size_t sz = sizeof(struct ubifs_pnode);
				struct ubifs_nnode *parent;

				pnode = kmalloc(sz, GFP_NOFS);
				if (!pnode) {
					err = -ENOMEM;
					goto out;
				}
				memcpy(pnode, &path[h].pnode, sz);
				parent = pnode->parent;
				parent->nbranch[pnode->iip].pnode = pnode;
				path[h].ptr.pnode = pnode;
				path[h].in_tree = 1;
				update_cats(c, pnode);
				c->pnodes_have += 1;
			}
			/* Debug-only consistency checks of the whole LPT */
			err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
						  c->nroot, 0, 0);
			if (err)
				goto out;
			err = dbg_check_cats(c);
			if (err)
				goto out;
		}
		if (ret & LPT_SCAN_STOP) {
			err = 0;
			break;
		}
		/* Get the next lprops */
		if (lnum == end_lnum) {
			/*
			 * We got to the end without finding what we were
			 * looking for
			 */
			err = -ENOSPC;
			goto out;
		}
		if (lnum + 1 >= c->leb_cnt) {
			/* Wrap-around to the beginning */
			start_lnum = c->main_first;
			goto again;
		}
		if (iip + 1 < UBIFS_LPT_FANOUT) {
			/* Next lprops is in the same pnode */
			iip += 1;
			continue;
		}
		/* We need to get the next pnode. Go up until we can go right */
		iip = pnode->iip;
		while (1) {
			h -= 1;
			ubifs_assert(h >= 0);
			nnode = path[h].ptr.nnode;
			if (iip + 1 < UBIFS_LPT_FANOUT)
				break;
			iip = nnode->iip;
		}
		/* Go right */
		iip += 1;
		/* Descend to the pnode */
		h += 1;
		for (; h < c->lpt_hght; h++) {
			nnode = scan_get_nnode(c, path + h, nnode, iip);
			if (IS_ERR(nnode)) {
				err = PTR_ERR(nnode);
				goto out;
			}
			/* After the first step, always take leftmost branch */
			iip = 0;
		}
		pnode = scan_get_pnode(c, path + h, nnode, iip);
		if (IS_ERR(pnode)) {
			err = PTR_ERR(pnode);
			goto out;
		}
		iip = 0;
	}
out:
	kfree(path);
	return err;
}
| 2047 | |||
| 2048 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 2049 | |||
/**
 * dbg_chk_pnode - check a pnode.
 * @c: the UBIFS file-system description object
 * @pnode: pnode to check
 * @col: pnode column
 *
 * Verifies that @pnode's number matches its computed column, and that every
 * in-range lprops in it has a consistent LEB number, a category compatible
 * with its flags, membership in the matching category heap or list, and
 * free/dirty amounts that match its category.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
			 int col)
{
	int i;

	if (pnode->num != col) {
		dbg_err("pnode num %d expected %d parent num %d iip %d",
			pnode->num, col, pnode->parent->num, pnode->iip);
		return -EINVAL;
	}
	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
		struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
		/* LEB number this slot corresponds to in the main area */
		int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
			   c->main_first;
		int found, cat = lprops->flags & LPROPS_CAT_MASK;
		struct ubifs_lpt_heap *heap;
		struct list_head *list = NULL;

		/* The last pnode may cover slots past the end of the media */
		if (lnum >= c->leb_cnt)
			continue;
		if (lprops->lnum != lnum) {
			dbg_err("bad LEB number %d expected %d",
				lprops->lnum, lnum);
			return -EINVAL;
		}
		/* Taken LEBs must be uncategorized and are checked no further */
		if (lprops->flags & LPROPS_TAKEN) {
			if (cat != LPROPS_UNCAT) {
				dbg_err("LEB %d taken but not uncat %d",
					lprops->lnum, cat);
				return -EINVAL;
			}
			continue;
		}
		/* Category must be compatible with the index flag */
		if (lprops->flags & LPROPS_INDEX) {
			switch (cat) {
			case LPROPS_UNCAT:
			case LPROPS_DIRTY_IDX:
			case LPROPS_FRDI_IDX:
				break;
			default:
				dbg_err("LEB %d index but cat %d",
					lprops->lnum, cat);
				return -EINVAL;
			}
		} else {
			switch (cat) {
			case LPROPS_UNCAT:
			case LPROPS_DIRTY:
			case LPROPS_FREE:
			case LPROPS_EMPTY:
			case LPROPS_FREEABLE:
				break;
			default:
				dbg_err("LEB %d not index but cat %d",
					lprops->lnum, cat);
				return -EINVAL;
			}
		}
		/* Pick the list the lprops should be on (heap cats skip this) */
		switch (cat) {
		case LPROPS_UNCAT:
			list = &c->uncat_list;
			break;
		case LPROPS_EMPTY:
			list = &c->empty_list;
			break;
		case LPROPS_FREEABLE:
			list = &c->freeable_list;
			break;
		case LPROPS_FRDI_IDX:
			list = &c->frdi_idx_list;
			break;
		}
		found = 0;
		/* Check membership in the matching category heap or list */
		switch (cat) {
		case LPROPS_DIRTY:
		case LPROPS_DIRTY_IDX:
		case LPROPS_FREE:
			/* Heap categories are 1-based, lpt_heap[] is 0-based */
			heap = &c->lpt_heap[cat - 1];
			if (lprops->hpos < heap->cnt &&
			    heap->arr[lprops->hpos] == lprops)
				found = 1;
			break;
		case LPROPS_UNCAT:
		case LPROPS_EMPTY:
		case LPROPS_FREEABLE:
		case LPROPS_FRDI_IDX:
			list_for_each_entry(lp, list, list)
				if (lprops == lp) {
					found = 1;
					break;
				}
			break;
		}
		if (!found) {
			dbg_err("LEB %d cat %d not found in cat heap/list",
				lprops->lnum, cat);
			return -EINVAL;
		}
		/* Free/dirty amounts must match the category */
		switch (cat) {
		case LPROPS_EMPTY:
			if (lprops->free != c->leb_size) {
				dbg_err("LEB %d cat %d free %d dirty %d",
					lprops->lnum, cat, lprops->free,
					lprops->dirty);
				return -EINVAL;
			}
			/*
			 * NOTE(review): no break here - empty LEBs also get
			 * the free + dirty == leb_size check below, which for
			 * them implies dirty == 0. Looks intentional, but
			 * confirm the fall-through is deliberate.
			 */
		case LPROPS_FREEABLE:
		case LPROPS_FRDI_IDX:
			if (lprops->free + lprops->dirty != c->leb_size) {
				dbg_err("LEB %d cat %d free %d dirty %d",
					lprops->lnum, cat, lprops->free,
					lprops->dirty);
				return -EINVAL;
			}
		}
	}
	return 0;
}
| 2176 | |||
/**
 * dbg_check_lpt_nodes - check nnodes and pnodes.
 * @c: the UBIFS file-system description object
 * @cnode: next cnode (nnode or pnode) to check
 * @row: row of cnode (root is zero)
 * @col: column of cnode (leftmost is zero)
 *
 * Iterative depth-first walk of the in-memory LPT starting at @cnode:
 * nnodes have their node number checked against the (row, col) position,
 * pnodes are fully validated by 'dbg_chk_pnode()'. Only does anything when
 * the %UBIFS_CHK_LPROPS debugging check is enabled.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
			int row, int col)
{
	struct ubifs_nnode *nnode, *nn;
	struct ubifs_cnode *cn;
	int num, iip = 0, err;

	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
		return 0;

	while (cnode) {
		ubifs_assert(row >= 0);
		nnode = cnode->parent;
		if (cnode->level) {
			/* cnode is a nnode */
			num = calc_nnode_num(row, col);
			if (cnode->num != num) {
				dbg_err("nnode num %d expected %d "
					"parent num %d iip %d", cnode->num, num,
					(nnode ? nnode->num : 0), cnode->iip);
				return -EINVAL;
			}
			nn = (struct ubifs_nnode *)cnode;
			/* Find the next child in memory, starting at iip */
			while (iip < UBIFS_LPT_FANOUT) {
				cn = nn->nbranch[iip].cnode;
				if (cn) {
					/* Go down */
					row += 1;
					/* Child's column in the wider row */
					col <<= UBIFS_LPT_FANOUT_SHIFT;
					col += iip;
					iip = 0;
					cnode = cn;
					break;
				}
				/* Go right */
				iip += 1;
			}
			/* Descended to a child - process it first */
			if (iip < UBIFS_LPT_FANOUT)
				continue;
		} else {
			struct ubifs_pnode *pnode;

			/* cnode is a pnode */
			pnode = (struct ubifs_pnode *)cnode;
			err = dbg_chk_pnode(c, pnode, col);
			if (err)
				return err;
		}
		/* Go up and to the right */
		row -= 1;
		col >>= UBIFS_LPT_FANOUT_SHIFT;
		/* Resume the parent's child scan after this child's slot */
		iip = cnode->iip + 1;
		cnode = (struct ubifs_cnode *)nnode;
	}
	return 0;
}
| 2242 | |||
| 2243 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c new file mode 100644 index 000000000000..5f0b83e20af6 --- /dev/null +++ b/fs/ubifs/lpt_commit.c | |||
| @@ -0,0 +1,1648 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements commit-related functionality of the LEB properties | ||
| 25 | * subsystem. | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include <linux/crc16.h> | ||
| 29 | #include "ubifs.h" | ||
| 30 | |||
| 31 | /** | ||
| 32 | * first_dirty_cnode - find first dirty cnode. | ||
| 33 | * @c: UBIFS file-system description object | ||
| 34 | * @nnode: nnode at which to start | ||
| 35 | * | ||
| 36 | * This function returns the first dirty cnode or %NULL if there is not one. | ||
| 37 | */ | ||
| 38 | static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode) | ||
| 39 | { | ||
| 40 | ubifs_assert(nnode); | ||
| 41 | while (1) { | ||
| 42 | int i, cont = 0; | ||
| 43 | |||
| 44 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 45 | struct ubifs_cnode *cnode; | ||
| 46 | |||
| 47 | cnode = nnode->nbranch[i].cnode; | ||
| 48 | if (cnode && | ||
| 49 | test_bit(DIRTY_CNODE, &cnode->flags)) { | ||
| 50 | if (cnode->level == 0) | ||
| 51 | return cnode; | ||
| 52 | nnode = (struct ubifs_nnode *)cnode; | ||
| 53 | cont = 1; | ||
| 54 | break; | ||
| 55 | } | ||
| 56 | } | ||
| 57 | if (!cont) | ||
| 58 | return (struct ubifs_cnode *)nnode; | ||
| 59 | } | ||
| 60 | } | ||
| 61 | |||
| 62 | /** | ||
| 63 | * next_dirty_cnode - find next dirty cnode. | ||
| 64 | * @cnode: cnode from which to begin searching | ||
| 65 | * | ||
| 66 | * This function returns the next dirty cnode or %NULL if there is not one. | ||
| 67 | */ | ||
| 68 | static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode) | ||
| 69 | { | ||
| 70 | struct ubifs_nnode *nnode; | ||
| 71 | int i; | ||
| 72 | |||
| 73 | ubifs_assert(cnode); | ||
| 74 | nnode = cnode->parent; | ||
| 75 | if (!nnode) | ||
| 76 | return NULL; | ||
| 77 | for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) { | ||
| 78 | cnode = nnode->nbranch[i].cnode; | ||
| 79 | if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) { | ||
| 80 | if (cnode->level == 0) | ||
| 81 | return cnode; /* cnode is a pnode */ | ||
| 82 | /* cnode is a nnode */ | ||
| 83 | return first_dirty_cnode((struct ubifs_nnode *)cnode); | ||
| 84 | } | ||
| 85 | } | ||
| 86 | return (struct ubifs_cnode *)nnode; | ||
| 87 | } | ||
| 88 | |||
| 89 | /** | ||
| 90 | * get_cnodes_to_commit - create list of dirty cnodes to commit. | ||
| 91 | * @c: UBIFS file-system description object | ||
| 92 | * | ||
| 93 | * This function returns the number of cnodes to commit. | ||
| 94 | */ | ||
| 95 | static int get_cnodes_to_commit(struct ubifs_info *c) | ||
| 96 | { | ||
| 97 | struct ubifs_cnode *cnode, *cnext; | ||
| 98 | int cnt = 0; | ||
| 99 | |||
| 100 | if (!c->nroot) | ||
| 101 | return 0; | ||
| 102 | |||
| 103 | if (!test_bit(DIRTY_CNODE, &c->nroot->flags)) | ||
| 104 | return 0; | ||
| 105 | |||
| 106 | c->lpt_cnext = first_dirty_cnode(c->nroot); | ||
| 107 | cnode = c->lpt_cnext; | ||
| 108 | if (!cnode) | ||
| 109 | return 0; | ||
| 110 | cnt += 1; | ||
| 111 | while (1) { | ||
| 112 | ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); | ||
| 113 | __set_bit(COW_ZNODE, &cnode->flags); | ||
| 114 | cnext = next_dirty_cnode(cnode); | ||
| 115 | if (!cnext) { | ||
| 116 | cnode->cnext = c->lpt_cnext; | ||
| 117 | break; | ||
| 118 | } | ||
| 119 | cnode->cnext = cnext; | ||
| 120 | cnode = cnext; | ||
| 121 | cnt += 1; | ||
| 122 | } | ||
| 123 | dbg_cmt("committing %d cnodes", cnt); | ||
| 124 | dbg_lp("committing %d cnodes", cnt); | ||
| 125 | ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt); | ||
| 126 | return cnt; | ||
| 127 | } | ||
| 128 | |||
| 129 | /** | ||
| 130 | * upd_ltab - update LPT LEB properties. | ||
| 131 | * @c: UBIFS file-system description object | ||
| 132 | * @lnum: LEB number | ||
| 133 | * @free: amount of free space | ||
| 134 | * @dirty: amount of dirty space to add | ||
| 135 | */ | ||
| 136 | static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty) | ||
| 137 | { | ||
| 138 | dbg_lp("LEB %d free %d dirty %d to %d +%d", | ||
| 139 | lnum, c->ltab[lnum - c->lpt_first].free, | ||
| 140 | c->ltab[lnum - c->lpt_first].dirty, free, dirty); | ||
| 141 | ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); | ||
| 142 | c->ltab[lnum - c->lpt_first].free = free; | ||
| 143 | c->ltab[lnum - c->lpt_first].dirty += dirty; | ||
| 144 | } | ||
| 145 | |||
| 146 | /** | ||
| 147 | * alloc_lpt_leb - allocate an LPT LEB that is empty. | ||
| 148 | * @c: UBIFS file-system description object | ||
| 149 | * @lnum: LEB number is passed and returned here | ||
| 150 | * | ||
| 151 | * This function finds the next empty LEB in the ltab starting from @lnum. If a | ||
| 152 | * an empty LEB is found it is returned in @lnum and the function returns %0. | ||
| 153 | * Otherwise the function returns -ENOSPC. Note however, that LPT is designed | ||
| 154 | * never to run out of space. | ||
| 155 | */ | ||
| 156 | static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) | ||
| 157 | { | ||
| 158 | int i, n; | ||
| 159 | |||
| 160 | n = *lnum - c->lpt_first + 1; | ||
| 161 | for (i = n; i < c->lpt_lebs; i++) { | ||
| 162 | if (c->ltab[i].tgc || c->ltab[i].cmt) | ||
| 163 | continue; | ||
| 164 | if (c->ltab[i].free == c->leb_size) { | ||
| 165 | c->ltab[i].cmt = 1; | ||
| 166 | *lnum = i + c->lpt_first; | ||
| 167 | return 0; | ||
| 168 | } | ||
| 169 | } | ||
| 170 | |||
| 171 | for (i = 0; i < n; i++) { | ||
| 172 | if (c->ltab[i].tgc || c->ltab[i].cmt) | ||
| 173 | continue; | ||
| 174 | if (c->ltab[i].free == c->leb_size) { | ||
| 175 | c->ltab[i].cmt = 1; | ||
| 176 | *lnum = i + c->lpt_first; | ||
| 177 | return 0; | ||
| 178 | } | ||
| 179 | } | ||
| 180 | dbg_err("last LEB %d", *lnum); | ||
| 181 | dump_stack(); | ||
| 182 | return -ENOSPC; | ||
| 183 | } | ||
| 184 | |||
/**
 * layout_cnodes - layout cnodes for commit.
 * @c: UBIFS file-system description object
 *
 * Walks the circular list of dirty cnodes built by 'get_cnodes_to_commit()'
 * and assigns each one a position (LEB number and offset) in the LPT area,
 * recording it in the parent's branch (or in @c->lpt_lnum/@c->lpt_offs for
 * the root). Space is also reserved for the lsave table ("big" model only)
 * and the ltab table. Nothing is written here; 'write_cnodes()' later
 * replays the identical allocation sequence via 'realloc_lpt_leb()'.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int layout_cnodes(struct ubifs_info *c)
{
	int lnum, offs, len, alen, done_lsave, done_ltab, err;
	struct ubifs_cnode *cnode;

	cnode = c->lpt_cnext;
	if (!cnode)
		return 0;
	/* Continue from the current LPT write head */
	lnum = c->nhead_lnum;
	offs = c->nhead_offs;
	/* Try to place lsave and ltab nicely */
	done_lsave = !c->big_lpt;
	done_ltab = 0;
	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
		done_lsave = 1;
		c->lsave_lnum = lnum;
		c->lsave_offs = offs;
		offs += c->lsave_sz;
	}

	if (offs + c->ltab_sz <= c->leb_size) {
		done_ltab = 1;
		c->ltab_lnum = lnum;
		c->ltab_offs = offs;
		offs += c->ltab_sz;
	}

	do {
		if (cnode->level) {
			len = c->nnode_sz;
			c->dirty_nn_cnt -= 1;
		} else {
			len = c->pnode_sz;
			c->dirty_pn_cnt -= 1;
		}
		/*
		 * Node does not fit in this LEB - account the wasted tail as
		 * dirty and move to a fresh LEB. Loop because lsave/ltab get
		 * first claim on the start of each new LEB.
		 */
		while (offs + len > c->leb_size) {
			alen = ALIGN(offs, c->min_io_size);
			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
			err = alloc_lpt_leb(c, &lnum);
			if (err)
				return err;
			offs = 0;
			ubifs_assert(lnum >= c->lpt_first &&
				     lnum <= c->lpt_last);
			/* Try to place lsave and ltab nicely */
			if (!done_lsave) {
				done_lsave = 1;
				c->lsave_lnum = lnum;
				c->lsave_offs = offs;
				offs += c->lsave_sz;
				continue;
			}
			if (!done_ltab) {
				done_ltab = 1;
				c->ltab_lnum = lnum;
				c->ltab_offs = offs;
				offs += c->ltab_sz;
				continue;
			}
			break;
		}
		/* Record where this cnode will be written */
		if (cnode->parent) {
			cnode->parent->nbranch[cnode->iip].lnum = lnum;
			cnode->parent->nbranch[cnode->iip].offs = offs;
		} else {
			c->lpt_lnum = lnum;
			c->lpt_offs = offs;
		}
		offs += len;
		cnode = cnode->cnext;
	} while (cnode && cnode != c->lpt_cnext);

	/* Make sure to place LPT's save table */
	if (!done_lsave) {
		if (offs + c->lsave_sz > c->leb_size) {
			alen = ALIGN(offs, c->min_io_size);
			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
			err = alloc_lpt_leb(c, &lnum);
			if (err)
				return err;
			offs = 0;
			ubifs_assert(lnum >= c->lpt_first &&
				     lnum <= c->lpt_last);
		}
		done_lsave = 1;
		c->lsave_lnum = lnum;
		c->lsave_offs = offs;
		offs += c->lsave_sz;
	}

	/* Make sure to place LPT's own lprops table */
	if (!done_ltab) {
		if (offs + c->ltab_sz > c->leb_size) {
			alen = ALIGN(offs, c->min_io_size);
			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
			err = alloc_lpt_leb(c, &lnum);
			if (err)
				return err;
			offs = 0;
			ubifs_assert(lnum >= c->lpt_first &&
				     lnum <= c->lpt_last);
		}
		done_ltab = 1;
		c->ltab_lnum = lnum;
		c->ltab_offs = offs;
		offs += c->ltab_sz;
	}

	/* Account the unused, padded tail of the last LEB */
	alen = ALIGN(offs, c->min_io_size);
	upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
	return 0;
}
| 303 | |||
| 304 | /** | ||
| 305 | * realloc_lpt_leb - allocate an LPT LEB that is empty. | ||
| 306 | * @c: UBIFS file-system description object | ||
| 307 | * @lnum: LEB number is passed and returned here | ||
| 308 | * | ||
| 309 | * This function duplicates exactly the results of the function alloc_lpt_leb. | ||
| 310 | * It is used during end commit to reallocate the same LEB numbers that were | ||
| 311 | * allocated by alloc_lpt_leb during start commit. | ||
| 312 | * | ||
| 313 | * This function finds the next LEB that was allocated by the alloc_lpt_leb | ||
| 314 | * function starting from @lnum. If a LEB is found it is returned in @lnum and | ||
| 315 | * the function returns %0. Otherwise the function returns -ENOSPC. | ||
| 316 | * Note however, that LPT is designed never to run out of space. | ||
| 317 | */ | ||
| 318 | static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) | ||
| 319 | { | ||
| 320 | int i, n; | ||
| 321 | |||
| 322 | n = *lnum - c->lpt_first + 1; | ||
| 323 | for (i = n; i < c->lpt_lebs; i++) | ||
| 324 | if (c->ltab[i].cmt) { | ||
| 325 | c->ltab[i].cmt = 0; | ||
| 326 | *lnum = i + c->lpt_first; | ||
| 327 | return 0; | ||
| 328 | } | ||
| 329 | |||
| 330 | for (i = 0; i < n; i++) | ||
| 331 | if (c->ltab[i].cmt) { | ||
| 332 | c->ltab[i].cmt = 0; | ||
| 333 | *lnum = i + c->lpt_first; | ||
| 334 | return 0; | ||
| 335 | } | ||
| 336 | dbg_err("last LEB %d", *lnum); | ||
| 337 | dump_stack(); | ||
| 338 | return -ENOSPC; | ||
| 339 | } | ||
| 340 | |||
/**
 * write_cnodes - write cnodes for commit.
 * @c: UBIFS file-system description object
 *
 * Writes out the dirty cnodes (plus the ltab and, for the "big" model, the
 * lsave table) to the positions chosen by 'layout_cnodes()'. Data is packed
 * into @c->lpt_buf and flushed a LEB at a time; 'realloc_lpt_leb()' replays
 * the exact LEB allocation sequence of 'alloc_lpt_leb()' so the on-flash
 * positions match those already recorded in the node branches.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int write_cnodes(struct ubifs_info *c)
{
	int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
	struct ubifs_cnode *cnode;
	void *buf = c->lpt_buf;

	cnode = c->lpt_cnext;
	if (!cnode)
		return 0;
	/* Continue from the current LPT write head */
	lnum = c->nhead_lnum;
	offs = c->nhead_offs;
	/* 'from' marks the start of not-yet-flushed data in 'buf' */
	from = offs;
	/* Ensure empty LEB is unmapped */
	if (offs == 0) {
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			return err;
	}
	/* Try to place lsave and ltab nicely */
	done_lsave = !c->big_lpt;
	done_ltab = 0;
	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
		done_lsave = 1;
		ubifs_pack_lsave(c, buf + offs, c->lsave);
		offs += c->lsave_sz;
	}

	if (offs + c->ltab_sz <= c->leb_size) {
		done_ltab = 1;
		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
		offs += c->ltab_sz;
	}

	/* Loop for each cnode */
	do {
		if (cnode->level)
			len = c->nnode_sz;
		else
			len = c->pnode_sz;
		/*
		 * Node does not fit - flush the buffered data (padded to the
		 * min I/O unit with 0xff) and move to the next reserved LEB.
		 * Loop because lsave/ltab get first claim on each new LEB,
		 * mirroring layout_cnodes().
		 */
		while (offs + len > c->leb_size) {
			wlen = offs - from;
			if (wlen) {
				alen = ALIGN(wlen, c->min_io_size);
				memset(buf + offs, 0xff, alen - wlen);
				err = ubifs_leb_write(c, lnum, buf + from, from,
						      alen, UBI_SHORTTERM);
				if (err)
					return err;
			}
			err = realloc_lpt_leb(c, &lnum);
			if (err)
				return err;
			offs = 0;
			from = 0;
			ubifs_assert(lnum >= c->lpt_first &&
				     lnum <= c->lpt_last);
			err = ubifs_leb_unmap(c, lnum);
			if (err)
				return err;
			/* Try to place lsave and ltab nicely */
			if (!done_lsave) {
				done_lsave = 1;
				ubifs_pack_lsave(c, buf + offs, c->lsave);
				offs += c->lsave_sz;
				continue;
			}
			if (!done_ltab) {
				done_ltab = 1;
				ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
				offs += c->ltab_sz;
				continue;
			}
			break;
		}
		if (cnode->level)
			ubifs_pack_nnode(c, buf + offs,
					 (struct ubifs_nnode *)cnode);
		else
			ubifs_pack_pnode(c, buf + offs,
					 (struct ubifs_pnode *)cnode);
		/*
		 * The reason for the barriers is the same as in case of TNC.
		 * See comment in 'write_index()'. 'dirty_cow_nnode()' and
		 * 'dirty_cow_pnode()' are the functions for which this is
		 * important.
		 */
		clear_bit(DIRTY_CNODE, &cnode->flags);
		smp_mb__before_clear_bit();
		clear_bit(COW_ZNODE, &cnode->flags);
		smp_mb__after_clear_bit();
		offs += len;
		cnode = cnode->cnext;
	} while (cnode && cnode != c->lpt_cnext);

	/* Make sure to place LPT's save table */
	if (!done_lsave) {
		if (offs + c->lsave_sz > c->leb_size) {
			/* Flush current LEB and start a new one */
			wlen = offs - from;
			alen = ALIGN(wlen, c->min_io_size);
			memset(buf + offs, 0xff, alen - wlen);
			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
					      UBI_SHORTTERM);
			if (err)
				return err;
			err = realloc_lpt_leb(c, &lnum);
			if (err)
				return err;
			offs = 0;
			ubifs_assert(lnum >= c->lpt_first &&
				     lnum <= c->lpt_last);
			err = ubifs_leb_unmap(c, lnum);
			if (err)
				return err;
		}
		done_lsave = 1;
		ubifs_pack_lsave(c, buf + offs, c->lsave);
		offs += c->lsave_sz;
	}

	/* Make sure to place LPT's own lprops table */
	if (!done_ltab) {
		if (offs + c->ltab_sz > c->leb_size) {
			/* Flush current LEB and start a new one */
			wlen = offs - from;
			alen = ALIGN(wlen, c->min_io_size);
			memset(buf + offs, 0xff, alen - wlen);
			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
					      UBI_SHORTTERM);
			if (err)
				return err;
			err = realloc_lpt_leb(c, &lnum);
			if (err)
				return err;
			offs = 0;
			ubifs_assert(lnum >= c->lpt_first &&
				     lnum <= c->lpt_last);
			err = ubifs_leb_unmap(c, lnum);
			if (err)
				return err;
		}
		done_ltab = 1;
		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
		offs += c->ltab_sz;
	}

	/* Write remaining data in buffer */
	wlen = offs - from;
	alen = ALIGN(wlen, c->min_io_size);
	memset(buf + offs, 0xff, alen - wlen);
	err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
	if (err)
		return err;
	/* Record the new LPT write head */
	c->nhead_lnum = lnum;
	c->nhead_offs = ALIGN(offs, c->min_io_size);

	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
	if (c->big_lpt)
		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
	return 0;
}
| 508 | |||
| 509 | /** | ||
| 510 | * next_pnode - find next pnode. | ||
| 511 | * @c: UBIFS file-system description object | ||
| 512 | * @pnode: pnode | ||
| 513 | * | ||
| 514 | * This function returns the next pnode or %NULL if there are no more pnodes. | ||
| 515 | */ | ||
| 516 | static struct ubifs_pnode *next_pnode(struct ubifs_info *c, | ||
| 517 | struct ubifs_pnode *pnode) | ||
| 518 | { | ||
| 519 | struct ubifs_nnode *nnode; | ||
| 520 | int iip; | ||
| 521 | |||
| 522 | /* Try to go right */ | ||
| 523 | nnode = pnode->parent; | ||
| 524 | iip = pnode->iip + 1; | ||
| 525 | if (iip < UBIFS_LPT_FANOUT) { | ||
| 526 | /* We assume here that LEB zero is never an LPT LEB */ | ||
| 527 | if (nnode->nbranch[iip].lnum) | ||
| 528 | return ubifs_get_pnode(c, nnode, iip); | ||
| 529 | else | ||
| 530 | return NULL; | ||
| 531 | } | ||
| 532 | |||
| 533 | /* Go up while can't go right */ | ||
| 534 | do { | ||
| 535 | iip = nnode->iip + 1; | ||
| 536 | nnode = nnode->parent; | ||
| 537 | if (!nnode) | ||
| 538 | return NULL; | ||
| 539 | /* We assume here that LEB zero is never an LPT LEB */ | ||
| 540 | } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); | ||
| 541 | |||
| 542 | /* Go right */ | ||
| 543 | nnode = ubifs_get_nnode(c, nnode, iip); | ||
| 544 | if (IS_ERR(nnode)) | ||
| 545 | return (void *)nnode; | ||
| 546 | |||
| 547 | /* Go down to level 1 */ | ||
| 548 | while (nnode->level > 1) { | ||
| 549 | nnode = ubifs_get_nnode(c, nnode, 0); | ||
| 550 | if (IS_ERR(nnode)) | ||
| 551 | return (void *)nnode; | ||
| 552 | } | ||
| 553 | |||
| 554 | return ubifs_get_pnode(c, nnode, 0); | ||
| 555 | } | ||
| 556 | |||
| 557 | /** | ||
| 558 | * pnode_lookup - lookup a pnode in the LPT. | ||
| 559 | * @c: UBIFS file-system description object | ||
| 560 | * @i: pnode number (0 to main_lebs - 1) | ||
| 561 | * | ||
| 562 | * This function returns a pointer to the pnode on success or a negative | ||
| 563 | * error code on failure. | ||
| 564 | */ | ||
| 565 | static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) | ||
| 566 | { | ||
| 567 | int err, h, iip, shft; | ||
| 568 | struct ubifs_nnode *nnode; | ||
| 569 | |||
| 570 | if (!c->nroot) { | ||
| 571 | err = ubifs_read_nnode(c, NULL, 0); | ||
| 572 | if (err) | ||
| 573 | return ERR_PTR(err); | ||
| 574 | } | ||
| 575 | i <<= UBIFS_LPT_FANOUT_SHIFT; | ||
| 576 | nnode = c->nroot; | ||
| 577 | shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; | ||
| 578 | for (h = 1; h < c->lpt_hght; h++) { | ||
| 579 | iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); | ||
| 580 | shft -= UBIFS_LPT_FANOUT_SHIFT; | ||
| 581 | nnode = ubifs_get_nnode(c, nnode, iip); | ||
| 582 | if (IS_ERR(nnode)) | ||
| 583 | return ERR_PTR(PTR_ERR(nnode)); | ||
| 584 | } | ||
| 585 | iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); | ||
| 586 | return ubifs_get_pnode(c, nnode, iip); | ||
| 587 | } | ||
| 588 | |||
| 589 | /** | ||
| 590 | * add_pnode_dirt - add dirty space to LPT LEB properties. | ||
| 591 | * @c: UBIFS file-system description object | ||
| 592 | * @pnode: pnode for which to add dirt | ||
| 593 | */ | ||
| 594 | static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) | ||
| 595 | { | ||
| 596 | ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, | ||
| 597 | c->pnode_sz); | ||
| 598 | } | ||
| 599 | |||
| 600 | /** | ||
| 601 | * do_make_pnode_dirty - mark a pnode dirty. | ||
| 602 | * @c: UBIFS file-system description object | ||
| 603 | * @pnode: pnode to mark dirty | ||
| 604 | */ | ||
| 605 | static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) | ||
| 606 | { | ||
| 607 | /* Assumes cnext list is empty i.e. not called during commit */ | ||
| 608 | if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { | ||
| 609 | struct ubifs_nnode *nnode; | ||
| 610 | |||
| 611 | c->dirty_pn_cnt += 1; | ||
| 612 | add_pnode_dirt(c, pnode); | ||
| 613 | /* Mark parent and ancestors dirty too */ | ||
| 614 | nnode = pnode->parent; | ||
| 615 | while (nnode) { | ||
| 616 | if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { | ||
| 617 | c->dirty_nn_cnt += 1; | ||
| 618 | ubifs_add_nnode_dirt(c, nnode); | ||
| 619 | nnode = nnode->parent; | ||
| 620 | } else | ||
| 621 | break; | ||
| 622 | } | ||
| 623 | } | ||
| 624 | } | ||
| 625 | |||
/**
 * make_tree_dirty - mark the entire LEB properties tree dirty.
 * @c: UBIFS file-system description object
 *
 * This function is used by the "small" LPT model to cause the entire LEB
 * properties tree to be written. The "small" LPT model does not use LPT
 * garbage collection because it is more efficient to write the entire tree
 * (because it is small).
 *
 * This function returns %0 on success and a negative error code on failure.
 *
 * Fix: the return value of pnode_lookup() was previously not checked for an
 * error, so an ERR_PTR would have been dereferenced by do_make_pnode_dirty().
 */
static int make_tree_dirty(struct ubifs_info *c)
{
	struct ubifs_pnode *pnode;

	pnode = pnode_lookup(c, 0);
	if (IS_ERR(pnode))
		return PTR_ERR(pnode);

	while (pnode) {
		do_make_pnode_dirty(c, pnode);
		pnode = next_pnode(c, pnode);
		if (IS_ERR(pnode))
			return PTR_ERR(pnode);
	}
	return 0;
}
| 650 | |||
| 651 | /** | ||
| 652 | * need_write_all - determine if the LPT area is running out of free space. | ||
| 653 | * @c: UBIFS file-system description object | ||
| 654 | * | ||
| 655 | * This function returns %1 if the LPT area is running out of free space and %0 | ||
| 656 | * if it is not. | ||
| 657 | */ | ||
| 658 | static int need_write_all(struct ubifs_info *c) | ||
| 659 | { | ||
| 660 | long long free = 0; | ||
| 661 | int i; | ||
| 662 | |||
| 663 | for (i = 0; i < c->lpt_lebs; i++) { | ||
| 664 | if (i + c->lpt_first == c->nhead_lnum) | ||
| 665 | free += c->leb_size - c->nhead_offs; | ||
| 666 | else if (c->ltab[i].free == c->leb_size) | ||
| 667 | free += c->leb_size; | ||
| 668 | else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) | ||
| 669 | free += c->leb_size; | ||
| 670 | } | ||
| 671 | /* Less than twice the size left */ | ||
| 672 | if (free <= c->lpt_sz * 2) | ||
| 673 | return 1; | ||
| 674 | return 0; | ||
| 675 | } | ||
| 676 | |||
| 677 | /** | ||
| 678 | * lpt_tgc_start - start trivial garbage collection of LPT LEBs. | ||
| 679 | * @c: UBIFS file-system description object | ||
| 680 | * | ||
| 681 | * LPT trivial garbage collection is where a LPT LEB contains only dirty and | ||
| 682 | * free space and so may be reused as soon as the next commit is completed. | ||
| 683 | * This function is called during start commit to mark LPT LEBs for trivial GC. | ||
| 684 | */ | ||
| 685 | static void lpt_tgc_start(struct ubifs_info *c) | ||
| 686 | { | ||
| 687 | int i; | ||
| 688 | |||
| 689 | for (i = 0; i < c->lpt_lebs; i++) { | ||
| 690 | if (i + c->lpt_first == c->nhead_lnum) | ||
| 691 | continue; | ||
| 692 | if (c->ltab[i].dirty > 0 && | ||
| 693 | c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { | ||
| 694 | c->ltab[i].tgc = 1; | ||
| 695 | c->ltab[i].free = c->leb_size; | ||
| 696 | c->ltab[i].dirty = 0; | ||
| 697 | dbg_lp("LEB %d", i + c->lpt_first); | ||
| 698 | } | ||
| 699 | } | ||
| 700 | } | ||
| 701 | |||
| 702 | /** | ||
| 703 | * lpt_tgc_end - end trivial garbage collection of LPT LEBs. | ||
| 704 | * @c: UBIFS file-system description object | ||
| 705 | * | ||
| 706 | * LPT trivial garbage collection is where a LPT LEB contains only dirty and | ||
| 707 | * free space and so may be reused as soon as the next commit is completed. | ||
| 708 | * This function is called after the commit is completed (master node has been | ||
| 709 | * written) and unmaps LPT LEBs that were marked for trivial GC. | ||
| 710 | */ | ||
| 711 | static int lpt_tgc_end(struct ubifs_info *c) | ||
| 712 | { | ||
| 713 | int i, err; | ||
| 714 | |||
| 715 | for (i = 0; i < c->lpt_lebs; i++) | ||
| 716 | if (c->ltab[i].tgc) { | ||
| 717 | err = ubifs_leb_unmap(c, i + c->lpt_first); | ||
| 718 | if (err) | ||
| 719 | return err; | ||
| 720 | c->ltab[i].tgc = 0; | ||
| 721 | dbg_lp("LEB %d", i + c->lpt_first); | ||
| 722 | } | ||
| 723 | return 0; | ||
| 724 | } | ||
| 725 | |||
| 726 | /** | ||
| 727 | * populate_lsave - fill the lsave array with important LEB numbers. | ||
| 728 | * @c: the UBIFS file-system description object | ||
| 729 | * | ||
| 730 | * This function is only called for the "big" model. It records a small number | ||
| 731 | * of LEB numbers of important LEBs. Important LEBs are ones that are (from | ||
| 732 | * most important to least important): empty, freeable, freeable index, dirty | ||
| 733 | * index, dirty or free. Upon mount, we read this list of LEB numbers and bring | ||
| 734 | * their pnodes into memory. That will stop us from having to scan the LPT | ||
| 735 | * straight away. For the "small" model we assume that scanning the LPT is no | ||
| 736 | * big deal. | ||
| 737 | */ | ||
| 738 | static void populate_lsave(struct ubifs_info *c) | ||
| 739 | { | ||
| 740 | struct ubifs_lprops *lprops; | ||
| 741 | struct ubifs_lpt_heap *heap; | ||
| 742 | int i, cnt = 0; | ||
| 743 | |||
| 744 | ubifs_assert(c->big_lpt); | ||
| 745 | if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { | ||
| 746 | c->lpt_drty_flgs |= LSAVE_DIRTY; | ||
| 747 | ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); | ||
| 748 | } | ||
| 749 | list_for_each_entry(lprops, &c->empty_list, list) { | ||
| 750 | c->lsave[cnt++] = lprops->lnum; | ||
| 751 | if (cnt >= c->lsave_cnt) | ||
| 752 | return; | ||
| 753 | } | ||
| 754 | list_for_each_entry(lprops, &c->freeable_list, list) { | ||
| 755 | c->lsave[cnt++] = lprops->lnum; | ||
| 756 | if (cnt >= c->lsave_cnt) | ||
| 757 | return; | ||
| 758 | } | ||
| 759 | list_for_each_entry(lprops, &c->frdi_idx_list, list) { | ||
| 760 | c->lsave[cnt++] = lprops->lnum; | ||
| 761 | if (cnt >= c->lsave_cnt) | ||
| 762 | return; | ||
| 763 | } | ||
| 764 | heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; | ||
| 765 | for (i = 0; i < heap->cnt; i++) { | ||
| 766 | c->lsave[cnt++] = heap->arr[i]->lnum; | ||
| 767 | if (cnt >= c->lsave_cnt) | ||
| 768 | return; | ||
| 769 | } | ||
| 770 | heap = &c->lpt_heap[LPROPS_DIRTY - 1]; | ||
| 771 | for (i = 0; i < heap->cnt; i++) { | ||
| 772 | c->lsave[cnt++] = heap->arr[i]->lnum; | ||
| 773 | if (cnt >= c->lsave_cnt) | ||
| 774 | return; | ||
| 775 | } | ||
| 776 | heap = &c->lpt_heap[LPROPS_FREE - 1]; | ||
| 777 | for (i = 0; i < heap->cnt; i++) { | ||
| 778 | c->lsave[cnt++] = heap->arr[i]->lnum; | ||
| 779 | if (cnt >= c->lsave_cnt) | ||
| 780 | return; | ||
| 781 | } | ||
| 782 | /* Fill it up completely */ | ||
| 783 | while (cnt < c->lsave_cnt) | ||
| 784 | c->lsave[cnt++] = c->main_first; | ||
| 785 | } | ||
| 786 | |||
| 787 | /** | ||
| 788 | * nnode_lookup - lookup a nnode in the LPT. | ||
| 789 | * @c: UBIFS file-system description object | ||
| 790 | * @i: nnode number | ||
| 791 | * | ||
| 792 | * This function returns a pointer to the nnode on success or a negative | ||
| 793 | * error code on failure. | ||
| 794 | */ | ||
| 795 | static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) | ||
| 796 | { | ||
| 797 | int err, iip; | ||
| 798 | struct ubifs_nnode *nnode; | ||
| 799 | |||
| 800 | if (!c->nroot) { | ||
| 801 | err = ubifs_read_nnode(c, NULL, 0); | ||
| 802 | if (err) | ||
| 803 | return ERR_PTR(err); | ||
| 804 | } | ||
| 805 | nnode = c->nroot; | ||
| 806 | while (1) { | ||
| 807 | iip = i & (UBIFS_LPT_FANOUT - 1); | ||
| 808 | i >>= UBIFS_LPT_FANOUT_SHIFT; | ||
| 809 | if (!i) | ||
| 810 | break; | ||
| 811 | nnode = ubifs_get_nnode(c, nnode, iip); | ||
| 812 | if (IS_ERR(nnode)) | ||
| 813 | return nnode; | ||
| 814 | } | ||
| 815 | return nnode; | ||
| 816 | } | ||
| 817 | |||
| 818 | /** | ||
| 819 | * make_nnode_dirty - find a nnode and, if found, make it dirty. | ||
| 820 | * @c: UBIFS file-system description object | ||
| 821 | * @node_num: nnode number of nnode to make dirty | ||
| 822 | * @lnum: LEB number where nnode was written | ||
| 823 | * @offs: offset where nnode was written | ||
| 824 | * | ||
| 825 | * This function is used by LPT garbage collection. LPT garbage collection is | ||
| 826 | * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection | ||
| 827 | * simply involves marking all the nodes in the LEB being garbage-collected as | ||
| 828 | * dirty. The dirty nodes are written next commit, after which the LEB is free | ||
| 829 | * to be reused. | ||
| 830 | * | ||
| 831 | * This function returns %0 on success and a negative error code on failure. | ||
| 832 | */ | ||
| 833 | static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, | ||
| 834 | int offs) | ||
| 835 | { | ||
| 836 | struct ubifs_nnode *nnode; | ||
| 837 | |||
| 838 | nnode = nnode_lookup(c, node_num); | ||
| 839 | if (IS_ERR(nnode)) | ||
| 840 | return PTR_ERR(nnode); | ||
| 841 | if (nnode->parent) { | ||
| 842 | struct ubifs_nbranch *branch; | ||
| 843 | |||
| 844 | branch = &nnode->parent->nbranch[nnode->iip]; | ||
| 845 | if (branch->lnum != lnum || branch->offs != offs) | ||
| 846 | return 0; /* nnode is obsolete */ | ||
| 847 | } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) | ||
| 848 | return 0; /* nnode is obsolete */ | ||
| 849 | /* Assumes cnext list is empty i.e. not called during commit */ | ||
| 850 | if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { | ||
| 851 | c->dirty_nn_cnt += 1; | ||
| 852 | ubifs_add_nnode_dirt(c, nnode); | ||
| 853 | /* Mark parent and ancestors dirty too */ | ||
| 854 | nnode = nnode->parent; | ||
| 855 | while (nnode) { | ||
| 856 | if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { | ||
| 857 | c->dirty_nn_cnt += 1; | ||
| 858 | ubifs_add_nnode_dirt(c, nnode); | ||
| 859 | nnode = nnode->parent; | ||
| 860 | } else | ||
| 861 | break; | ||
| 862 | } | ||
| 863 | } | ||
| 864 | return 0; | ||
| 865 | } | ||
| 866 | |||
| 867 | /** | ||
| 868 | * make_pnode_dirty - find a pnode and, if found, make it dirty. | ||
| 869 | * @c: UBIFS file-system description object | ||
| 870 | * @node_num: pnode number of pnode to make dirty | ||
| 871 | * @lnum: LEB number where pnode was written | ||
| 872 | * @offs: offset where pnode was written | ||
| 873 | * | ||
| 874 | * This function is used by LPT garbage collection. LPT garbage collection is | ||
| 875 | * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection | ||
| 876 | * simply involves marking all the nodes in the LEB being garbage-collected as | ||
| 877 | * dirty. The dirty nodes are written next commit, after which the LEB is free | ||
| 878 | * to be reused. | ||
| 879 | * | ||
| 880 | * This function returns %0 on success and a negative error code on failure. | ||
| 881 | */ | ||
| 882 | static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, | ||
| 883 | int offs) | ||
| 884 | { | ||
| 885 | struct ubifs_pnode *pnode; | ||
| 886 | struct ubifs_nbranch *branch; | ||
| 887 | |||
| 888 | pnode = pnode_lookup(c, node_num); | ||
| 889 | if (IS_ERR(pnode)) | ||
| 890 | return PTR_ERR(pnode); | ||
| 891 | branch = &pnode->parent->nbranch[pnode->iip]; | ||
| 892 | if (branch->lnum != lnum || branch->offs != offs) | ||
| 893 | return 0; | ||
| 894 | do_make_pnode_dirty(c, pnode); | ||
| 895 | return 0; | ||
| 896 | } | ||
| 897 | |||
| 898 | /** | ||
| 899 | * make_ltab_dirty - make ltab node dirty. | ||
| 900 | * @c: UBIFS file-system description object | ||
| 901 | * @lnum: LEB number where ltab was written | ||
| 902 | * @offs: offset where ltab was written | ||
| 903 | * | ||
| 904 | * This function is used by LPT garbage collection. LPT garbage collection is | ||
| 905 | * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection | ||
| 906 | * simply involves marking all the nodes in the LEB being garbage-collected as | ||
| 907 | * dirty. The dirty nodes are written next commit, after which the LEB is free | ||
| 908 | * to be reused. | ||
| 909 | * | ||
| 910 | * This function returns %0 on success and a negative error code on failure. | ||
| 911 | */ | ||
| 912 | static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) | ||
| 913 | { | ||
| 914 | if (lnum != c->ltab_lnum || offs != c->ltab_offs) | ||
| 915 | return 0; /* This ltab node is obsolete */ | ||
| 916 | if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { | ||
| 917 | c->lpt_drty_flgs |= LTAB_DIRTY; | ||
| 918 | ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); | ||
| 919 | } | ||
| 920 | return 0; | ||
| 921 | } | ||
| 922 | |||
| 923 | /** | ||
| 924 | * make_lsave_dirty - make lsave node dirty. | ||
| 925 | * @c: UBIFS file-system description object | ||
| 926 | * @lnum: LEB number where lsave was written | ||
| 927 | * @offs: offset where lsave was written | ||
| 928 | * | ||
| 929 | * This function is used by LPT garbage collection. LPT garbage collection is | ||
| 930 | * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection | ||
| 931 | * simply involves marking all the nodes in the LEB being garbage-collected as | ||
| 932 | * dirty. The dirty nodes are written next commit, after which the LEB is free | ||
| 933 | * to be reused. | ||
| 934 | * | ||
| 935 | * This function returns %0 on success and a negative error code on failure. | ||
| 936 | */ | ||
| 937 | static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) | ||
| 938 | { | ||
| 939 | if (lnum != c->lsave_lnum || offs != c->lsave_offs) | ||
| 940 | return 0; /* This lsave node is obsolete */ | ||
| 941 | if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { | ||
| 942 | c->lpt_drty_flgs |= LSAVE_DIRTY; | ||
| 943 | ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); | ||
| 944 | } | ||
| 945 | return 0; | ||
| 946 | } | ||
| 947 | |||
| 948 | /** | ||
| 949 | * make_node_dirty - make node dirty. | ||
| 950 | * @c: UBIFS file-system description object | ||
| 951 | * @node_type: LPT node type | ||
| 952 | * @node_num: node number | ||
| 953 | * @lnum: LEB number where node was written | ||
| 954 | * @offs: offset where node was written | ||
| 955 | * | ||
| 956 | * This function is used by LPT garbage collection. LPT garbage collection is | ||
| 957 | * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection | ||
| 958 | * simply involves marking all the nodes in the LEB being garbage-collected as | ||
| 959 | * dirty. The dirty nodes are written next commit, after which the LEB is free | ||
| 960 | * to be reused. | ||
| 961 | * | ||
| 962 | * This function returns %0 on success and a negative error code on failure. | ||
| 963 | */ | ||
| 964 | static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, | ||
| 965 | int lnum, int offs) | ||
| 966 | { | ||
| 967 | switch (node_type) { | ||
| 968 | case UBIFS_LPT_NNODE: | ||
| 969 | return make_nnode_dirty(c, node_num, lnum, offs); | ||
| 970 | case UBIFS_LPT_PNODE: | ||
| 971 | return make_pnode_dirty(c, node_num, lnum, offs); | ||
| 972 | case UBIFS_LPT_LTAB: | ||
| 973 | return make_ltab_dirty(c, lnum, offs); | ||
| 974 | case UBIFS_LPT_LSAVE: | ||
| 975 | return make_lsave_dirty(c, lnum, offs); | ||
| 976 | } | ||
| 977 | return -EINVAL; | ||
| 978 | } | ||
| 979 | |||
| 980 | /** | ||
| 981 | * get_lpt_node_len - return the length of a node based on its type. | ||
| 982 | * @c: UBIFS file-system description object | ||
| 983 | * @node_type: LPT node type | ||
| 984 | */ | ||
| 985 | static int get_lpt_node_len(struct ubifs_info *c, int node_type) | ||
| 986 | { | ||
| 987 | switch (node_type) { | ||
| 988 | case UBIFS_LPT_NNODE: | ||
| 989 | return c->nnode_sz; | ||
| 990 | case UBIFS_LPT_PNODE: | ||
| 991 | return c->pnode_sz; | ||
| 992 | case UBIFS_LPT_LTAB: | ||
| 993 | return c->ltab_sz; | ||
| 994 | case UBIFS_LPT_LSAVE: | ||
| 995 | return c->lsave_sz; | ||
| 996 | } | ||
| 997 | return 0; | ||
| 998 | } | ||
| 999 | |||
| 1000 | /** | ||
| 1001 | * get_pad_len - return the length of padding in a buffer. | ||
| 1002 | * @c: UBIFS file-system description object | ||
| 1003 | * @buf: buffer | ||
| 1004 | * @len: length of buffer | ||
| 1005 | */ | ||
| 1006 | static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) | ||
| 1007 | { | ||
| 1008 | int offs, pad_len; | ||
| 1009 | |||
| 1010 | if (c->min_io_size == 1) | ||
| 1011 | return 0; | ||
| 1012 | offs = c->leb_size - len; | ||
| 1013 | pad_len = ALIGN(offs, c->min_io_size) - offs; | ||
| 1014 | return pad_len; | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | /** | ||
| 1018 | * get_lpt_node_type - return type (and node number) of a node in a buffer. | ||
| 1019 | * @c: UBIFS file-system description object | ||
| 1020 | * @buf: buffer | ||
| 1021 | * @node_num: node number is returned here | ||
| 1022 | */ | ||
| 1023 | static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) | ||
| 1024 | { | ||
| 1025 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 1026 | int pos = 0, node_type; | ||
| 1027 | |||
| 1028 | node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); | ||
| 1029 | *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); | ||
| 1030 | return node_type; | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | /** | ||
| 1034 | * is_a_node - determine if a buffer contains a node. | ||
| 1035 | * @c: UBIFS file-system description object | ||
| 1036 | * @buf: buffer | ||
| 1037 | * @len: length of buffer | ||
| 1038 | * | ||
| 1039 | * This function returns %1 if the buffer contains a node or %0 if it does not. | ||
| 1040 | */ | ||
| 1041 | static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) | ||
| 1042 | { | ||
| 1043 | uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; | ||
| 1044 | int pos = 0, node_type, node_len; | ||
| 1045 | uint16_t crc, calc_crc; | ||
| 1046 | |||
| 1047 | node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); | ||
| 1048 | if (node_type == UBIFS_LPT_NOT_A_NODE) | ||
| 1049 | return 0; | ||
| 1050 | node_len = get_lpt_node_len(c, node_type); | ||
| 1051 | if (!node_len || node_len > len) | ||
| 1052 | return 0; | ||
| 1053 | pos = 0; | ||
| 1054 | addr = buf; | ||
| 1055 | crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); | ||
| 1056 | calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, | ||
| 1057 | node_len - UBIFS_LPT_CRC_BYTES); | ||
| 1058 | if (crc != calc_crc) | ||
| 1059 | return 0; | ||
| 1060 | return 1; | ||
| 1061 | } | ||
| 1062 | |||
| 1063 | |||
| 1064 | /** | ||
| 1065 | * lpt_gc_lnum - garbage collect a LPT LEB. | ||
| 1066 | * @c: UBIFS file-system description object | ||
| 1067 | * @lnum: LEB number to garbage collect | ||
| 1068 | * | ||
| 1069 | * LPT garbage collection is used only for the "big" LPT model | ||
| 1070 | * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes | ||
| 1071 | * in the LEB being garbage-collected as dirty. The dirty nodes are written | ||
| 1072 | * next commit, after which the LEB is free to be reused. | ||
| 1073 | * | ||
| 1074 | * This function returns %0 on success and a negative error code on failure. | ||
| 1075 | */ | ||
| 1076 | static int lpt_gc_lnum(struct ubifs_info *c, int lnum) | ||
| 1077 | { | ||
| 1078 | int err, len = c->leb_size, node_type, node_num, node_len, offs; | ||
| 1079 | void *buf = c->lpt_buf; | ||
| 1080 | |||
| 1081 | dbg_lp("LEB %d", lnum); | ||
| 1082 | err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); | ||
| 1083 | if (err) { | ||
| 1084 | ubifs_err("cannot read LEB %d, error %d", lnum, err); | ||
| 1085 | return err; | ||
| 1086 | } | ||
| 1087 | while (1) { | ||
| 1088 | if (!is_a_node(c, buf, len)) { | ||
| 1089 | int pad_len; | ||
| 1090 | |||
| 1091 | pad_len = get_pad_len(c, buf, len); | ||
| 1092 | if (pad_len) { | ||
| 1093 | buf += pad_len; | ||
| 1094 | len -= pad_len; | ||
| 1095 | continue; | ||
| 1096 | } | ||
| 1097 | return 0; | ||
| 1098 | } | ||
| 1099 | node_type = get_lpt_node_type(c, buf, &node_num); | ||
| 1100 | node_len = get_lpt_node_len(c, node_type); | ||
| 1101 | offs = c->leb_size - len; | ||
| 1102 | ubifs_assert(node_len != 0); | ||
| 1103 | mutex_lock(&c->lp_mutex); | ||
| 1104 | err = make_node_dirty(c, node_type, node_num, lnum, offs); | ||
| 1105 | mutex_unlock(&c->lp_mutex); | ||
| 1106 | if (err) | ||
| 1107 | return err; | ||
| 1108 | buf += node_len; | ||
| 1109 | len -= node_len; | ||
| 1110 | } | ||
| 1111 | return 0; | ||
| 1112 | } | ||
| 1113 | |||
| 1114 | /** | ||
| 1115 | * lpt_gc - LPT garbage collection. | ||
| 1116 | * @c: UBIFS file-system description object | ||
| 1117 | * | ||
| 1118 | * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. | ||
| 1119 | * Returns %0 on success and a negative error code on failure. | ||
| 1120 | */ | ||
| 1121 | static int lpt_gc(struct ubifs_info *c) | ||
| 1122 | { | ||
| 1123 | int i, lnum = -1, dirty = 0; | ||
| 1124 | |||
| 1125 | mutex_lock(&c->lp_mutex); | ||
| 1126 | for (i = 0; i < c->lpt_lebs; i++) { | ||
| 1127 | ubifs_assert(!c->ltab[i].tgc); | ||
| 1128 | if (i + c->lpt_first == c->nhead_lnum || | ||
| 1129 | c->ltab[i].free + c->ltab[i].dirty == c->leb_size) | ||
| 1130 | continue; | ||
| 1131 | if (c->ltab[i].dirty > dirty) { | ||
| 1132 | dirty = c->ltab[i].dirty; | ||
| 1133 | lnum = i + c->lpt_first; | ||
| 1134 | } | ||
| 1135 | } | ||
| 1136 | mutex_unlock(&c->lp_mutex); | ||
| 1137 | if (lnum == -1) | ||
| 1138 | return -ENOSPC; | ||
| 1139 | return lpt_gc_lnum(c, lnum); | ||
| 1140 | } | ||
| 1141 | |||
| 1142 | /** | ||
| 1143 | * ubifs_lpt_start_commit - UBIFS commit starts. | ||
| 1144 | * @c: the UBIFS file-system description object | ||
| 1145 | * | ||
| 1146 | * This function has to be called when UBIFS starts the commit operation. | ||
| 1147 | * This function "freezes" all currently dirty LEB properties and does not | ||
| 1148 | * change them anymore. Further changes are saved and tracked separately | ||
| 1149 | * because they are not part of this commit. This function returns zero in case | ||
| 1150 | * of success and a negative error code in case of failure. | ||
| 1151 | */ | ||
| 1152 | int ubifs_lpt_start_commit(struct ubifs_info *c) | ||
| 1153 | { | ||
| 1154 | int err, cnt; | ||
| 1155 | |||
| 1156 | dbg_lp(""); | ||
| 1157 | |||
| 1158 | mutex_lock(&c->lp_mutex); | ||
| 1159 | err = dbg_check_ltab(c); | ||
| 1160 | if (err) | ||
| 1161 | goto out; | ||
| 1162 | |||
| 1163 | if (c->check_lpt_free) { | ||
| 1164 | /* | ||
| 1165 | * We ensure there is enough free space in | ||
| 1166 | * ubifs_lpt_post_commit() by marking nodes dirty. That | ||
| 1167 | * information is lost when we unmount, so we also need | ||
| 1168 | * to check free space once after mounting also. | ||
| 1169 | */ | ||
| 1170 | c->check_lpt_free = 0; | ||
| 1171 | while (need_write_all(c)) { | ||
| 1172 | mutex_unlock(&c->lp_mutex); | ||
| 1173 | err = lpt_gc(c); | ||
| 1174 | if (err) | ||
| 1175 | return err; | ||
| 1176 | mutex_lock(&c->lp_mutex); | ||
| 1177 | } | ||
| 1178 | } | ||
| 1179 | |||
| 1180 | lpt_tgc_start(c); | ||
| 1181 | |||
| 1182 | if (!c->dirty_pn_cnt) { | ||
| 1183 | dbg_cmt("no cnodes to commit"); | ||
| 1184 | err = 0; | ||
| 1185 | goto out; | ||
| 1186 | } | ||
| 1187 | |||
| 1188 | if (!c->big_lpt && need_write_all(c)) { | ||
| 1189 | /* If needed, write everything */ | ||
| 1190 | err = make_tree_dirty(c); | ||
| 1191 | if (err) | ||
| 1192 | goto out; | ||
| 1193 | lpt_tgc_start(c); | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | if (c->big_lpt) | ||
| 1197 | populate_lsave(c); | ||
| 1198 | |||
| 1199 | cnt = get_cnodes_to_commit(c); | ||
| 1200 | ubifs_assert(cnt != 0); | ||
| 1201 | |||
| 1202 | err = layout_cnodes(c); | ||
| 1203 | if (err) | ||
| 1204 | goto out; | ||
| 1205 | |||
| 1206 | /* Copy the LPT's own lprops for end commit to write */ | ||
| 1207 | memcpy(c->ltab_cmt, c->ltab, | ||
| 1208 | sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); | ||
| 1209 | c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY); | ||
| 1210 | |||
| 1211 | out: | ||
| 1212 | mutex_unlock(&c->lp_mutex); | ||
| 1213 | return err; | ||
| 1214 | } | ||
| 1215 | |||
| 1216 | /** | ||
| 1217 | * free_obsolete_cnodes - free obsolete cnodes for commit end. | ||
| 1218 | * @c: UBIFS file-system description object | ||
| 1219 | */ | ||
| 1220 | static void free_obsolete_cnodes(struct ubifs_info *c) | ||
| 1221 | { | ||
| 1222 | struct ubifs_cnode *cnode, *cnext; | ||
| 1223 | |||
| 1224 | cnext = c->lpt_cnext; | ||
| 1225 | if (!cnext) | ||
| 1226 | return; | ||
| 1227 | do { | ||
| 1228 | cnode = cnext; | ||
| 1229 | cnext = cnode->cnext; | ||
| 1230 | if (test_bit(OBSOLETE_CNODE, &cnode->flags)) | ||
| 1231 | kfree(cnode); | ||
| 1232 | else | ||
| 1233 | cnode->cnext = NULL; | ||
| 1234 | } while (cnext != c->lpt_cnext); | ||
| 1235 | c->lpt_cnext = NULL; | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | /** | ||
| 1239 | * ubifs_lpt_end_commit - finish the commit operation. | ||
| 1240 | * @c: the UBIFS file-system description object | ||
| 1241 | * | ||
| 1242 | * This function has to be called when the commit operation finishes. It | ||
| 1243 | * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to | ||
| 1244 | * the media. Returns zero in case of success and a negative error code in case | ||
| 1245 | * of failure. | ||
| 1246 | */ | ||
| 1247 | int ubifs_lpt_end_commit(struct ubifs_info *c) | ||
| 1248 | { | ||
| 1249 | int err; | ||
| 1250 | |||
| 1251 | dbg_lp(""); | ||
| 1252 | |||
| 1253 | if (!c->lpt_cnext) | ||
| 1254 | return 0; | ||
| 1255 | |||
| 1256 | err = write_cnodes(c); | ||
| 1257 | if (err) | ||
| 1258 | return err; | ||
| 1259 | |||
| 1260 | mutex_lock(&c->lp_mutex); | ||
| 1261 | free_obsolete_cnodes(c); | ||
| 1262 | mutex_unlock(&c->lp_mutex); | ||
| 1263 | |||
| 1264 | return 0; | ||
| 1265 | } | ||
| 1266 | |||
| 1267 | /** | ||
| 1268 | * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. | ||
| 1269 | * @c: UBIFS file-system description object | ||
| 1270 | * | ||
| 1271 | * LPT trivial GC is completed after a commit. Also LPT GC is done after a | ||
| 1272 | * commit for the "big" LPT model. | ||
| 1273 | */ | ||
| 1274 | int ubifs_lpt_post_commit(struct ubifs_info *c) | ||
| 1275 | { | ||
| 1276 | int err; | ||
| 1277 | |||
| 1278 | mutex_lock(&c->lp_mutex); | ||
| 1279 | err = lpt_tgc_end(c); | ||
| 1280 | if (err) | ||
| 1281 | goto out; | ||
| 1282 | if (c->big_lpt) | ||
| 1283 | while (need_write_all(c)) { | ||
| 1284 | mutex_unlock(&c->lp_mutex); | ||
| 1285 | err = lpt_gc(c); | ||
| 1286 | if (err) | ||
| 1287 | return err; | ||
| 1288 | mutex_lock(&c->lp_mutex); | ||
| 1289 | } | ||
| 1290 | out: | ||
| 1291 | mutex_unlock(&c->lp_mutex); | ||
| 1292 | return err; | ||
| 1293 | } | ||
| 1294 | |||
| 1295 | /** | ||
| 1296 | * first_nnode - find the first nnode in memory. | ||
| 1297 | * @c: UBIFS file-system description object | ||
| 1298 | * @hght: height of tree where nnode found is returned here | ||
| 1299 | * | ||
| 1300 | * This function returns a pointer to the nnode found or %NULL if no nnode is | ||
| 1301 | * found. This function is a helper to 'ubifs_lpt_free()'. | ||
| 1302 | */ | ||
| 1303 | static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) | ||
| 1304 | { | ||
| 1305 | struct ubifs_nnode *nnode; | ||
| 1306 | int h, i, found; | ||
| 1307 | |||
| 1308 | nnode = c->nroot; | ||
| 1309 | *hght = 0; | ||
| 1310 | if (!nnode) | ||
| 1311 | return NULL; | ||
| 1312 | for (h = 1; h < c->lpt_hght; h++) { | ||
| 1313 | found = 0; | ||
| 1314 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 1315 | if (nnode->nbranch[i].nnode) { | ||
| 1316 | found = 1; | ||
| 1317 | nnode = nnode->nbranch[i].nnode; | ||
| 1318 | *hght = h; | ||
| 1319 | break; | ||
| 1320 | } | ||
| 1321 | } | ||
| 1322 | if (!found) | ||
| 1323 | break; | ||
| 1324 | } | ||
| 1325 | return nnode; | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | /** | ||
| 1329 | * next_nnode - find the next nnode in memory. | ||
| 1330 | * @c: UBIFS file-system description object | ||
| 1331 | * @nnode: nnode from which to start. | ||
| 1332 | * @hght: height of tree where nnode is, is passed and returned here | ||
| 1333 | * | ||
| 1334 | * This function returns a pointer to the nnode found or %NULL if no nnode is | ||
| 1335 | * found. This function is a helper to 'ubifs_lpt_free()'. | ||
| 1336 | */ | ||
| 1337 | static struct ubifs_nnode *next_nnode(struct ubifs_info *c, | ||
| 1338 | struct ubifs_nnode *nnode, int *hght) | ||
| 1339 | { | ||
| 1340 | struct ubifs_nnode *parent; | ||
| 1341 | int iip, h, i, found; | ||
| 1342 | |||
| 1343 | parent = nnode->parent; | ||
| 1344 | if (!parent) | ||
| 1345 | return NULL; | ||
| 1346 | if (nnode->iip == UBIFS_LPT_FANOUT - 1) { | ||
| 1347 | *hght -= 1; | ||
| 1348 | return parent; | ||
| 1349 | } | ||
| 1350 | for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { | ||
| 1351 | nnode = parent->nbranch[iip].nnode; | ||
| 1352 | if (nnode) | ||
| 1353 | break; | ||
| 1354 | } | ||
| 1355 | if (!nnode) { | ||
| 1356 | *hght -= 1; | ||
| 1357 | return parent; | ||
| 1358 | } | ||
| 1359 | for (h = *hght + 1; h < c->lpt_hght; h++) { | ||
| 1360 | found = 0; | ||
| 1361 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) { | ||
| 1362 | if (nnode->nbranch[i].nnode) { | ||
| 1363 | found = 1; | ||
| 1364 | nnode = nnode->nbranch[i].nnode; | ||
| 1365 | *hght = h; | ||
| 1366 | break; | ||
| 1367 | } | ||
| 1368 | } | ||
| 1369 | if (!found) | ||
| 1370 | break; | ||
| 1371 | } | ||
| 1372 | return nnode; | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | /** | ||
| 1376 | * ubifs_lpt_free - free resources owned by the LPT. | ||
| 1377 | * @c: UBIFS file-system description object | ||
| 1378 | * @wr_only: free only resources used for writing | ||
| 1379 | */ | ||
| 1380 | void ubifs_lpt_free(struct ubifs_info *c, int wr_only) | ||
| 1381 | { | ||
| 1382 | struct ubifs_nnode *nnode; | ||
| 1383 | int i, hght; | ||
| 1384 | |||
| 1385 | /* Free write-only things first */ | ||
| 1386 | |||
| 1387 | free_obsolete_cnodes(c); /* Leftover from a failed commit */ | ||
| 1388 | |||
| 1389 | vfree(c->ltab_cmt); | ||
| 1390 | c->ltab_cmt = NULL; | ||
| 1391 | vfree(c->lpt_buf); | ||
| 1392 | c->lpt_buf = NULL; | ||
| 1393 | kfree(c->lsave); | ||
| 1394 | c->lsave = NULL; | ||
| 1395 | |||
| 1396 | if (wr_only) | ||
| 1397 | return; | ||
| 1398 | |||
| 1399 | /* Now free the rest */ | ||
| 1400 | |||
| 1401 | nnode = first_nnode(c, &hght); | ||
| 1402 | while (nnode) { | ||
| 1403 | for (i = 0; i < UBIFS_LPT_FANOUT; i++) | ||
| 1404 | kfree(nnode->nbranch[i].nnode); | ||
| 1405 | nnode = next_nnode(c, nnode, &hght); | ||
| 1406 | } | ||
| 1407 | for (i = 0; i < LPROPS_HEAP_CNT; i++) | ||
| 1408 | kfree(c->lpt_heap[i].arr); | ||
| 1409 | kfree(c->dirty_idx.arr); | ||
| 1410 | kfree(c->nroot); | ||
| 1411 | vfree(c->ltab); | ||
| 1412 | kfree(c->lpt_nod_buf); | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 1416 | |||
| 1417 | /** | ||
| 1418 | * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. | ||
| 1419 | * @buf: buffer | ||
| 1420 | * @len: buffer length | ||
| 1421 | */ | ||
| 1422 | static int dbg_is_all_ff(uint8_t *buf, int len) | ||
| 1423 | { | ||
| 1424 | int i; | ||
| 1425 | |||
| 1426 | for (i = 0; i < len; i++) | ||
| 1427 | if (buf[i] != 0xff) | ||
| 1428 | return 0; | ||
| 1429 | return 1; | ||
| 1430 | } | ||
| 1431 | |||
| 1432 | /** | ||
| 1433 | * dbg_is_nnode_dirty - determine if a nnode is dirty. | ||
| 1434 | * @c: the UBIFS file-system description object | ||
| 1435 | * @lnum: LEB number where nnode was written | ||
| 1436 | * @offs: offset where nnode was written | ||
| 1437 | */ | ||
| 1438 | static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) | ||
| 1439 | { | ||
| 1440 | struct ubifs_nnode *nnode; | ||
| 1441 | int hght; | ||
| 1442 | |||
| 1443 | /* Entire tree is in memory so first_nnode / next_nnode are ok */ | ||
| 1444 | nnode = first_nnode(c, &hght); | ||
| 1445 | for (; nnode; nnode = next_nnode(c, nnode, &hght)) { | ||
| 1446 | struct ubifs_nbranch *branch; | ||
| 1447 | |||
| 1448 | cond_resched(); | ||
| 1449 | if (nnode->parent) { | ||
| 1450 | branch = &nnode->parent->nbranch[nnode->iip]; | ||
| 1451 | if (branch->lnum != lnum || branch->offs != offs) | ||
| 1452 | continue; | ||
| 1453 | if (test_bit(DIRTY_CNODE, &nnode->flags)) | ||
| 1454 | return 1; | ||
| 1455 | return 0; | ||
| 1456 | } else { | ||
| 1457 | if (c->lpt_lnum != lnum || c->lpt_offs != offs) | ||
| 1458 | continue; | ||
| 1459 | if (test_bit(DIRTY_CNODE, &nnode->flags)) | ||
| 1460 | return 1; | ||
| 1461 | return 0; | ||
| 1462 | } | ||
| 1463 | } | ||
| 1464 | return 1; | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | /** | ||
| 1468 | * dbg_is_pnode_dirty - determine if a pnode is dirty. | ||
| 1469 | * @c: the UBIFS file-system description object | ||
| 1470 | * @lnum: LEB number where pnode was written | ||
| 1471 | * @offs: offset where pnode was written | ||
| 1472 | */ | ||
| 1473 | static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) | ||
| 1474 | { | ||
| 1475 | int i, cnt; | ||
| 1476 | |||
| 1477 | cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); | ||
| 1478 | for (i = 0; i < cnt; i++) { | ||
| 1479 | struct ubifs_pnode *pnode; | ||
| 1480 | struct ubifs_nbranch *branch; | ||
| 1481 | |||
| 1482 | cond_resched(); | ||
| 1483 | pnode = pnode_lookup(c, i); | ||
| 1484 | if (IS_ERR(pnode)) | ||
| 1485 | return PTR_ERR(pnode); | ||
| 1486 | branch = &pnode->parent->nbranch[pnode->iip]; | ||
| 1487 | if (branch->lnum != lnum || branch->offs != offs) | ||
| 1488 | continue; | ||
| 1489 | if (test_bit(DIRTY_CNODE, &pnode->flags)) | ||
| 1490 | return 1; | ||
| 1491 | return 0; | ||
| 1492 | } | ||
| 1493 | return 1; | ||
| 1494 | } | ||
| 1495 | |||
| 1496 | /** | ||
| 1497 | * dbg_is_ltab_dirty - determine if a ltab node is dirty. | ||
| 1498 | * @c: the UBIFS file-system description object | ||
| 1499 | * @lnum: LEB number where ltab node was written | ||
| 1500 | * @offs: offset where ltab node was written | ||
| 1501 | */ | ||
| 1502 | static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) | ||
| 1503 | { | ||
| 1504 | if (lnum != c->ltab_lnum || offs != c->ltab_offs) | ||
| 1505 | return 1; | ||
| 1506 | return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | /** | ||
| 1510 | * dbg_is_lsave_dirty - determine if a lsave node is dirty. | ||
| 1511 | * @c: the UBIFS file-system description object | ||
| 1512 | * @lnum: LEB number where lsave node was written | ||
| 1513 | * @offs: offset where lsave node was written | ||
| 1514 | */ | ||
| 1515 | static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) | ||
| 1516 | { | ||
| 1517 | if (lnum != c->lsave_lnum || offs != c->lsave_offs) | ||
| 1518 | return 1; | ||
| 1519 | return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; | ||
| 1520 | } | ||
| 1521 | |||
| 1522 | /** | ||
| 1523 | * dbg_is_node_dirty - determine if a node is dirty. | ||
| 1524 | * @c: the UBIFS file-system description object | ||
| 1525 | * @node_type: node type | ||
| 1526 | * @lnum: LEB number where node was written | ||
| 1527 | * @offs: offset where node was written | ||
| 1528 | */ | ||
| 1529 | static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, | ||
| 1530 | int offs) | ||
| 1531 | { | ||
| 1532 | switch (node_type) { | ||
| 1533 | case UBIFS_LPT_NNODE: | ||
| 1534 | return dbg_is_nnode_dirty(c, lnum, offs); | ||
| 1535 | case UBIFS_LPT_PNODE: | ||
| 1536 | return dbg_is_pnode_dirty(c, lnum, offs); | ||
| 1537 | case UBIFS_LPT_LTAB: | ||
| 1538 | return dbg_is_ltab_dirty(c, lnum, offs); | ||
| 1539 | case UBIFS_LPT_LSAVE: | ||
| 1540 | return dbg_is_lsave_dirty(c, lnum, offs); | ||
| 1541 | } | ||
| 1542 | return 1; | ||
| 1543 | } | ||
| 1544 | |||
/**
 * dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
 * @c: the UBIFS file-system description object
 * @lnum: LEB number to check
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
{
	/* 'len' counts down the unscanned bytes remaining in the LEB */
	int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
	int ret;
	void *buf = c->dbg_buf;

	dbg_lp("LEB %d", lnum);
	/* Read the whole LEB into the debug buffer and walk it node by node */
	err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
	if (err) {
		dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
		return err;
	}
	while (1) {
		if (!is_a_node(c, buf, len)) {
			int i, pad_len;

			/* Padding is accounted as dirty space */
			pad_len = get_pad_len(c, buf, len);
			if (pad_len) {
				buf += pad_len;
				len -= pad_len;
				dirty += pad_len;
				continue;
			}
			/* Past the last node: the rest must be erased (0xFF) */
			if (!dbg_is_all_ff(buf, len)) {
				dbg_msg("invalid empty space in LEB %d at %d",
					lnum, c->leb_size - len);
				err = -EINVAL;
			}
			/*
			 * Compare the observed free/dirty totals against the
			 * ltab entry for this LEB. Note, 'err' is still 0
			 * here unless a mismatch was found above.
			 */
			i = lnum - c->lpt_first;
			if (len != c->ltab[i].free) {
				dbg_msg("invalid free space in LEB %d "
					"(free %d, expected %d)",
					lnum, len, c->ltab[i].free);
				err = -EINVAL;
			}
			if (dirty != c->ltab[i].dirty) {
				dbg_msg("invalid dirty space in LEB %d "
					"(dirty %d, expected %d)",
					lnum, dirty, c->ltab[i].dirty);
				err = -EINVAL;
			}
			return err;
		}
		/* A valid node: obsolete (dirty) nodes add to the dirty total */
		node_type = get_lpt_node_type(c, buf, &node_num);
		node_len = get_lpt_node_len(c, node_type);
		ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
		if (ret == 1)
			dirty += node_len;
		buf += node_len;
		len -= node_len;
	}
}
| 1605 | |||
| 1606 | /** | ||
| 1607 | * dbg_check_ltab - check the free and dirty space in the ltab. | ||
| 1608 | * @c: the UBIFS file-system description object | ||
| 1609 | * | ||
| 1610 | * This function returns %0 on success and a negative error code on failure. | ||
| 1611 | */ | ||
| 1612 | int dbg_check_ltab(struct ubifs_info *c) | ||
| 1613 | { | ||
| 1614 | int lnum, err, i, cnt; | ||
| 1615 | |||
| 1616 | if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) | ||
| 1617 | return 0; | ||
| 1618 | |||
| 1619 | /* Bring the entire tree into memory */ | ||
| 1620 | cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); | ||
| 1621 | for (i = 0; i < cnt; i++) { | ||
| 1622 | struct ubifs_pnode *pnode; | ||
| 1623 | |||
| 1624 | pnode = pnode_lookup(c, i); | ||
| 1625 | if (IS_ERR(pnode)) | ||
| 1626 | return PTR_ERR(pnode); | ||
| 1627 | cond_resched(); | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | /* Check nodes */ | ||
| 1631 | err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); | ||
| 1632 | if (err) | ||
| 1633 | return err; | ||
| 1634 | |||
| 1635 | /* Check each LEB */ | ||
| 1636 | for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { | ||
| 1637 | err = dbg_check_ltab_lnum(c, lnum); | ||
| 1638 | if (err) { | ||
| 1639 | dbg_err("failed at LEB %d", lnum); | ||
| 1640 | return err; | ||
| 1641 | } | ||
| 1642 | } | ||
| 1643 | |||
| 1644 | dbg_lp("succeeded"); | ||
| 1645 | return 0; | ||
| 1646 | } | ||
| 1647 | |||
| 1648 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c new file mode 100644 index 000000000000..71d5493bf565 --- /dev/null +++ b/fs/ubifs/master.c | |||
| @@ -0,0 +1,387 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* This file implements reading and writing the master node */ | ||
| 24 | |||
| 25 | #include "ubifs.h" | ||
| 26 | |||
| 27 | /** | ||
| 28 | * scan_for_master - search the valid master node. | ||
| 29 | * @c: UBIFS file-system description object | ||
| 30 | * | ||
 * This function scans the master node LEBs and searches for the latest master
| 32 | * node. Returns zero in case of success and a negative error code in case of | ||
| 33 | * failure. | ||
| 34 | */ | ||
static int scan_for_master(struct ubifs_info *c)
{
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	int lnum, offs = 0, nodes_cnt;

	lnum = UBIFS_MST_LNUM;

	/* Scan the first master LEB and take the last node found there */
	sleb = ubifs_scan(c, lnum, 0, c->sbuf);
	if (IS_ERR(sleb))
		return PTR_ERR(sleb);
	nodes_cnt = sleb->nodes_cnt;
	if (nodes_cnt > 0) {
		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
				  list);
		if (snod->type != UBIFS_MST_NODE)
			goto out;
		/* Keep a copy of the candidate master node */
		memcpy(c->mst_node, snod->node, snod->len);
		offs = snod->offs;
	}
	ubifs_scan_destroy(sleb);

	lnum += 1;

	/*
	 * Scan the second master LEB. The two copies must agree in node
	 * count, position, and contents (apart from the common header) —
	 * any mismatch is treated as a failure (the caller falls back to
	 * master-node recovery).
	 */
	sleb = ubifs_scan(c, lnum, 0, c->sbuf);
	if (IS_ERR(sleb))
		return PTR_ERR(sleb);
	if (sleb->nodes_cnt != nodes_cnt)
		goto out;
	if (!sleb->nodes_cnt)
		goto out;
	snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
	if (snod->type != UBIFS_MST_NODE)
		goto out;
	if (snod->offs != offs)
		goto out;
	/* Compare everything after the common header */
	if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
		   (void *)snod->node + UBIFS_CH_SZ,
		   UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
		goto out;
	c->mst_offs = offs;
	ubifs_scan_destroy(sleb);
	return 0;

out:
	ubifs_scan_destroy(sleb);
	return -EINVAL;
}
| 83 | |||
| 84 | /** | ||
| 85 | * validate_master - validate master node. | ||
| 86 | * @c: UBIFS file-system description object | ||
| 87 | * | ||
| 88 | * This function validates data which was read from master node. Returns zero | ||
| 89 | * if the data is all right and %-EINVAL if not. | ||
| 90 | */ | ||
static int validate_master(const struct ubifs_info *c)
{
	long long main_sz;
	/*
	 * 'err' is a small check number identifying which sanity check
	 * failed; it is printed in the error message at 'out' and must not
	 * be renumbered.
	 */
	int err;

	if (c->max_sqnum >= SQNUM_WATERMARK) {
		err = 1;
		goto out;
	}

	if (c->cmt_no >= c->max_sqnum) {
		err = 2;
		goto out;
	}

	if (c->highest_inum >= INUM_WATERMARK) {
		err = 3;
		goto out;
	}

	/* Log head must lie inside the log area, min_io_size-aligned */
	if (c->lhead_lnum < UBIFS_LOG_LNUM ||
	    c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs ||
	    c->lhead_offs < 0 || c->lhead_offs >= c->leb_size ||
	    c->lhead_offs & (c->min_io_size - 1)) {
		err = 4;
		goto out;
	}

	/* Index root must be in the main area, 8-byte aligned */
	if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first ||
	    c->zroot.offs >= c->leb_size || c->zroot.offs & 7) {
		err = 5;
		goto out;
	}

	if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len ||
	    c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) {
		err = 6;
		goto out;
	}

	if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) {
		err = 7;
		goto out;
	}

	/* Index head: in the main area, min_io_size and 8-byte aligned */
	if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first ||
	    c->ihead_offs % c->min_io_size || c->ihead_offs < 0 ||
	    c->ihead_offs > c->leb_size || c->ihead_offs & 7) {
		err = 8;
		goto out;
	}

	main_sz = (long long)c->main_lebs * c->leb_size;
	if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
		err = 9;
		goto out;
	}

	/* LPT root, nnode head, ltab and lsave must be in the LPT area */
	if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last ||
	    c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) {
		err = 10;
		goto out;
	}

	if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last ||
	    c->nhead_offs < 0 || c->nhead_offs % c->min_io_size ||
	    c->nhead_offs > c->leb_size) {
		err = 11;
		goto out;
	}

	if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last ||
	    c->ltab_offs < 0 ||
	    c->ltab_offs + c->ltab_sz > c->leb_size) {
		err = 12;
		goto out;
	}

	/* lsave only exists on "big" LPT layouts */
	if (c->big_lpt && (c->lsave_lnum < c->lpt_first ||
	    c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 ||
	    c->lsave_offs + c->lsave_sz > c->leb_size)) {
		err = 13;
		goto out;
	}

	if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) {
		err = 14;
		goto out;
	}

	/* LEB-property statistics: counts, ranges, 8-byte alignment */
	if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) {
		err = 15;
		goto out;
	}

	if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) {
		err = 16;
		goto out;
	}

	if (c->lst.total_free < 0 || c->lst.total_free > main_sz ||
	    c->lst.total_free & 7) {
		err = 17;
		goto out;
	}

	if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) {
		err = 18;
		goto out;
	}

	if (c->lst.total_used < 0 || (c->lst.total_used & 7)) {
		err = 19;
		goto out;
	}

	/* The aggregate totals must fit inside the main area */
	if (c->lst.total_free + c->lst.total_dirty +
	    c->lst.total_used > main_sz) {
		err = 20;
		goto out;
	}

	if (c->lst.total_dead + c->lst.total_dark +
	    c->lst.total_used + c->old_idx_sz > main_sz) {
		err = 21;
		goto out;
	}

	if (c->lst.total_dead < 0 ||
	    c->lst.total_dead > c->lst.total_free + c->lst.total_dirty ||
	    c->lst.total_dead & 7) {
		err = 22;
		goto out;
	}

	if (c->lst.total_dark < 0 ||
	    c->lst.total_dark > c->lst.total_free + c->lst.total_dirty ||
	    c->lst.total_dark & 7) {
		err = 23;
		goto out;
	}

	return 0;

out:
	ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
	dbg_dump_node(c, c->mst_node);
	return -EINVAL;
}
| 240 | |||
| 241 | /** | ||
| 242 | * ubifs_read_master - read master node. | ||
| 243 | * @c: UBIFS file-system description object | ||
| 244 | * | ||
| 245 | * This function finds and reads the master node during file-system mount. If | ||
| 246 | * the flash is empty, it creates default master node as well. Returns zero in | ||
| 247 | * case of success and a negative error code in case of failure. | ||
| 248 | */ | ||
int ubifs_read_master(struct ubifs_info *c)
{
	int err, old_leb_cnt;

	c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
	if (!c->mst_node)
		return -ENOMEM;

	/* Find the valid master node; fall back to recovery on failure */
	err = scan_for_master(c);
	if (err) {
		err = ubifs_recover_master_node(c);
		if (err)
			/*
			 * Note, we do not free 'c->mst_node' here because the
			 * unmount routine will take care of this.
			 */
			return err;
	}

	/* Make sure that the recovery flag is clear */
	c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);

	/* Unpack the on-flash (little-endian) fields into the FS state */
	c->max_sqnum       = le64_to_cpu(c->mst_node->ch.sqnum);
	c->highest_inum    = le64_to_cpu(c->mst_node->highest_inum);
	c->cmt_no          = le64_to_cpu(c->mst_node->cmt_no);
	c->zroot.lnum      = le32_to_cpu(c->mst_node->root_lnum);
	c->zroot.offs      = le32_to_cpu(c->mst_node->root_offs);
	c->zroot.len       = le32_to_cpu(c->mst_node->root_len);
	c->lhead_lnum      = le32_to_cpu(c->mst_node->log_lnum);
	c->gc_lnum         = le32_to_cpu(c->mst_node->gc_lnum);
	c->ihead_lnum      = le32_to_cpu(c->mst_node->ihead_lnum);
	c->ihead_offs      = le32_to_cpu(c->mst_node->ihead_offs);
	c->old_idx_sz      = le64_to_cpu(c->mst_node->index_size);
	c->lpt_lnum        = le32_to_cpu(c->mst_node->lpt_lnum);
	c->lpt_offs        = le32_to_cpu(c->mst_node->lpt_offs);
	c->nhead_lnum      = le32_to_cpu(c->mst_node->nhead_lnum);
	c->nhead_offs      = le32_to_cpu(c->mst_node->nhead_offs);
	c->ltab_lnum       = le32_to_cpu(c->mst_node->ltab_lnum);
	c->ltab_offs       = le32_to_cpu(c->mst_node->ltab_offs);
	c->lsave_lnum      = le32_to_cpu(c->mst_node->lsave_lnum);
	c->lsave_offs      = le32_to_cpu(c->mst_node->lsave_offs);
	c->lscan_lnum      = le32_to_cpu(c->mst_node->lscan_lnum);
	c->lst.empty_lebs  = le32_to_cpu(c->mst_node->empty_lebs);
	c->lst.idx_lebs    = le32_to_cpu(c->mst_node->idx_lebs);
	old_leb_cnt        = le32_to_cpu(c->mst_node->leb_cnt);
	c->lst.total_free  = le64_to_cpu(c->mst_node->total_free);
	c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
	c->lst.total_used  = le64_to_cpu(c->mst_node->total_used);
	c->lst.total_dead  = le64_to_cpu(c->mst_node->total_dead);
	c->lst.total_dark  = le64_to_cpu(c->mst_node->total_dark);

	c->calc_idx_sz = c->old_idx_sz;

	if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
		c->no_orphs = 1;

	if (old_leb_cnt != c->leb_cnt) {
		/* The file system has been resized */
		int growth = c->leb_cnt - old_leb_cnt;

		/* Only growing is allowed, and never below the minimum */
		if (c->leb_cnt < old_leb_cnt ||
		    c->leb_cnt < UBIFS_MIN_LEB_CNT) {
			ubifs_err("bad leb_cnt on master node");
			dbg_dump_node(c, c->mst_node);
			return -EINVAL;
		}

		dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",
			old_leb_cnt, c->leb_cnt);
		/* New LEBs start out empty/free/dark */
		c->lst.empty_lebs += growth;
		c->lst.total_free += growth * (long long)c->leb_size;
		c->lst.total_dark += growth * (long long)c->dark_wm;

		/*
		 * Reflect changes back onto the master node. N.B. the master
		 * node gets written immediately whenever mounting (or
		 * remounting) in read-write mode, so we do not need to write it
		 * here.
		 */
		c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
		c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
		c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
		c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
	}

	/* Sanity-check everything we just read */
	err = validate_master(c);
	if (err)
		return err;

	err = dbg_old_index_check_init(c, &c->zroot);

	return err;
}
| 342 | |||
| 343 | /** | ||
| 344 | * ubifs_write_master - write master node. | ||
| 345 | * @c: UBIFS file-system description object | ||
| 346 | * | ||
| 347 | * This function writes the master node. The caller has to take the | ||
| 348 | * @c->mst_mutex lock before calling this function. Returns zero in case of | ||
| 349 | * success and a negative error code in case of failure. The master node is | ||
| 350 | * written twice to enable recovery. | ||
| 351 | */ | ||
int ubifs_write_master(struct ubifs_info *c)
{
	int err, lnum, offs, len;

	if (c->ro_media)
		return -EINVAL;

	/* Advance past the previous copy; wrap to offset 0 on LEB overflow */
	lnum = UBIFS_MST_LNUM;
	offs = c->mst_offs + c->mst_node_alsz;
	len = UBIFS_MST_NODE_SZ;

	if (offs + UBIFS_MST_NODE_SZ > c->leb_size) {
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			return err;
		offs = 0;
	}

	c->mst_offs = offs;
	c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);

	/* First copy, in UBIFS_MST_LNUM */
	err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
	if (err)
		return err;

	lnum += 1;

	/* Second copy at the same offset in the next LEB (for recovery) */
	if (offs == 0) {
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			return err;
	}
	err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);

	return err;
}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h new file mode 100644 index 000000000000..4beccfc256d2 --- /dev/null +++ b/fs/ubifs/misc.h | |||
| @@ -0,0 +1,342 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file contains miscellaneous helper functions. | ||
| 25 | */ | ||
| 26 | |||
| 27 | #ifndef __UBIFS_MISC_H__ | ||
| 28 | #define __UBIFS_MISC_H__ | ||
| 29 | |||
| 30 | /** | ||
| 31 | * ubifs_zn_dirty - check if znode is dirty. | ||
| 32 | * @znode: znode to check | ||
| 33 | * | ||
| 34 | * This helper function returns %1 if @znode is dirty and %0 otherwise. | ||
| 35 | */ | ||
| 36 | static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) | ||
| 37 | { | ||
| 38 | return !!test_bit(DIRTY_ZNODE, &znode->flags); | ||
| 39 | } | ||
| 40 | |||
| 41 | /** | ||
| 42 | * ubifs_wake_up_bgt - wake up background thread. | ||
| 43 | * @c: UBIFS file-system description object | ||
| 44 | */ | ||
| 45 | static inline void ubifs_wake_up_bgt(struct ubifs_info *c) | ||
| 46 | { | ||
| 47 | if (c->bgt && !c->need_bgt) { | ||
| 48 | c->need_bgt = 1; | ||
| 49 | wake_up_process(c->bgt); | ||
| 50 | } | ||
| 51 | } | ||
| 52 | |||
| 53 | /** | ||
| 54 | * ubifs_tnc_find_child - find next child in znode. | ||
| 55 | * @znode: znode to search at | ||
| 56 | * @start: the zbranch index to start at | ||
| 57 | * | ||
| 58 | * This helper function looks for znode child starting at index @start. Returns | ||
| 59 | * the child or %NULL if no children were found. | ||
| 60 | */ | ||
| 61 | static inline struct ubifs_znode * | ||
| 62 | ubifs_tnc_find_child(struct ubifs_znode *znode, int start) | ||
| 63 | { | ||
| 64 | while (start < znode->child_cnt) { | ||
| 65 | if (znode->zbranch[start].znode) | ||
| 66 | return znode->zbranch[start].znode; | ||
| 67 | start += 1; | ||
| 68 | } | ||
| 69 | |||
| 70 | return NULL; | ||
| 71 | } | ||
| 72 | |||
| 73 | /** | ||
| 74 | * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. | ||
| 75 | * @inode: the VFS 'struct inode' pointer | ||
| 76 | */ | ||
| 77 | static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) | ||
| 78 | { | ||
| 79 | return container_of(inode, struct ubifs_inode, vfs_inode); | ||
| 80 | } | ||
| 81 | |||
| 82 | /** | ||
 * ubifs_ro_mode - switch UBIFS to read-only mode.
| 84 | * @c: UBIFS file-system description object | ||
| 85 | * @err: error code which is the reason of switching to R/O mode | ||
| 86 | */ | ||
| 87 | static inline void ubifs_ro_mode(struct ubifs_info *c, int err) | ||
| 88 | { | ||
| 89 | if (!c->ro_media) { | ||
| 90 | c->ro_media = 1; | ||
| 91 | ubifs_warn("switched to read-only mode, error %d", err); | ||
| 92 | dbg_dump_stack(); | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 96 | /** | ||
| 97 | * ubifs_compr_present - check if compressor was compiled in. | ||
| 98 | * @compr_type: compressor type to check | ||
| 99 | * | ||
 * This function returns %1 if the compressor of type @compr_type is present,
 * and %0 if not.
| 102 | */ | ||
| 103 | static inline int ubifs_compr_present(int compr_type) | ||
| 104 | { | ||
| 105 | ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); | ||
| 106 | return !!ubifs_compressors[compr_type]->capi_name; | ||
| 107 | } | ||
| 108 | |||
| 109 | /** | ||
| 110 | * ubifs_compr_name - get compressor name string by its type. | ||
| 111 | * @compr_type: compressor type | ||
| 112 | * | ||
| 113 | * This function returns compressor type string. | ||
| 114 | */ | ||
| 115 | static inline const char *ubifs_compr_name(int compr_type) | ||
| 116 | { | ||
| 117 | ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); | ||
| 118 | return ubifs_compressors[compr_type]->name; | ||
| 119 | } | ||
| 120 | |||
| 121 | /** | ||
| 122 | * ubifs_wbuf_sync - synchronize write-buffer. | ||
| 123 | * @wbuf: write-buffer to synchronize | ||
| 124 | * | ||
 * This is the same as 'ubifs_wbuf_sync_nolock()' but it does not assume
| 126 | * that the write-buffer is already locked. | ||
| 127 | */ | ||
static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
{
	int err;

	/* Take the write-buffer lock, sync, then release */
	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
	err = ubifs_wbuf_sync_nolock(wbuf);
	mutex_unlock(&wbuf->io_mutex);
	return err;
}
| 137 | |||
| 138 | /** | ||
| 139 | * ubifs_leb_unmap - unmap an LEB. | ||
| 140 | * @c: UBIFS file-system description object | ||
| 141 | * @lnum: LEB number to unmap | ||
| 142 | * | ||
| 143 | * This function returns %0 on success and a negative error code on failure. | ||
| 144 | */ | ||
| 145 | static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) | ||
| 146 | { | ||
| 147 | int err; | ||
| 148 | |||
| 149 | if (c->ro_media) | ||
| 150 | return -EROFS; | ||
| 151 | err = ubi_leb_unmap(c->ubi, lnum); | ||
| 152 | if (err) { | ||
| 153 | ubifs_err("unmap LEB %d failed, error %d", lnum, err); | ||
| 154 | return err; | ||
| 155 | } | ||
| 156 | |||
| 157 | return 0; | ||
| 158 | } | ||
| 159 | |||
| 160 | /** | ||
| 161 | * ubifs_leb_write - write to a LEB. | ||
| 162 | * @c: UBIFS file-system description object | ||
| 163 | * @lnum: LEB number to write | ||
| 164 | * @buf: buffer to write from | ||
| 165 | * @offs: offset within LEB to write to | ||
| 166 | * @len: length to write | ||
| 167 | * @dtype: data type | ||
| 168 | * | ||
| 169 | * This function returns %0 on success and a negative error code on failure. | ||
| 170 | */ | ||
| 171 | static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, | ||
| 172 | const void *buf, int offs, int len, int dtype) | ||
| 173 | { | ||
| 174 | int err; | ||
| 175 | |||
| 176 | if (c->ro_media) | ||
| 177 | return -EROFS; | ||
| 178 | err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); | ||
| 179 | if (err) { | ||
| 180 | ubifs_err("writing %d bytes at %d:%d, error %d", | ||
| 181 | len, lnum, offs, err); | ||
| 182 | return err; | ||
| 183 | } | ||
| 184 | |||
| 185 | return 0; | ||
| 186 | } | ||
| 187 | |||
| 188 | /** | ||
| 189 | * ubifs_leb_change - atomic LEB change. | ||
| 190 | * @c: UBIFS file-system description object | ||
| 191 | * @lnum: LEB number to write | ||
| 192 | * @buf: buffer to write from | ||
| 193 | * @len: length to write | ||
| 194 | * @dtype: data type | ||
| 195 | * | ||
| 196 | * This function returns %0 on success and a negative error code on failure. | ||
| 197 | */ | ||
| 198 | static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, | ||
| 199 | const void *buf, int len, int dtype) | ||
| 200 | { | ||
| 201 | int err; | ||
| 202 | |||
| 203 | if (c->ro_media) | ||
| 204 | return -EROFS; | ||
| 205 | err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); | ||
| 206 | if (err) { | ||
| 207 | ubifs_err("changing %d bytes in LEB %d, error %d", | ||
| 208 | len, lnum, err); | ||
| 209 | return err; | ||
| 210 | } | ||
| 211 | |||
| 212 | return 0; | ||
| 213 | } | ||
| 214 | |||
| 215 | /** | ||
| 216 | * ubifs_encode_dev - encode device node IDs. | ||
| 217 | * @dev: UBIFS device node information | ||
| 218 | * @rdev: device IDs to encode | ||
| 219 | * | ||
| 220 | * This is a helper function which encodes major/minor numbers of a device node | ||
| 221 | * into UBIFS device node description. We use standard Linux "new" and "huge" | ||
| 222 | * encodings. | ||
| 223 | */ | ||
| 224 | static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) | ||
| 225 | { | ||
| 226 | if (new_valid_dev(rdev)) { | ||
| 227 | dev->new = cpu_to_le32(new_encode_dev(rdev)); | ||
| 228 | return sizeof(dev->new); | ||
| 229 | } else { | ||
| 230 | dev->huge = cpu_to_le64(huge_encode_dev(rdev)); | ||
| 231 | return sizeof(dev->huge); | ||
| 232 | } | ||
| 233 | } | ||
| 234 | |||
| 235 | /** | ||
| 236 | * ubifs_add_dirt - add dirty space to LEB properties. | ||
| 237 | * @c: the UBIFS file-system description object | ||
| 238 | * @lnum: LEB to add dirty space for | ||
| 239 | * @dirty: dirty space to add | ||
| 240 | * | ||
| 241 | * This is a helper function which increased amount of dirty LEB space. Returns | ||
| 242 | * zero in case of success and a negative error code in case of failure. | ||
| 243 | */ | ||
| 244 | static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) | ||
| 245 | { | ||
| 246 | return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); | ||
| 247 | } | ||
| 248 | |||
| 249 | /** | ||
| 250 | * ubifs_return_leb - return LEB to lprops. | ||
| 251 | * @c: the UBIFS file-system description object | ||
| 252 | * @lnum: LEB to return | ||
| 253 | * | ||
| 254 | * This helper function cleans the "taken" flag of a logical eraseblock in the | ||
| 255 | * lprops. Returns zero in case of success and a negative error code in case of | ||
| 256 | * failure. | ||
| 257 | */ | ||
| 258 | static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) | ||
| 259 | { | ||
| 260 | return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, | ||
| 261 | LPROPS_TAKEN, 0); | ||
| 262 | } | ||
| 263 | |||
| 264 | /** | ||
| 265 | * ubifs_idx_node_sz - return index node size. | ||
| 266 | * @c: the UBIFS file-system description object | ||
| 267 | * @child_cnt: number of children of this index node | ||
| 268 | */ | ||
| 269 | static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) | ||
| 270 | { | ||
| 271 | return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; | ||
| 272 | } | ||
| 273 | |||
| 274 | /** | ||
| 275 | * ubifs_idx_branch - return pointer to an index branch. | ||
| 276 | * @c: the UBIFS file-system description object | ||
| 277 | * @idx: index node | ||
| 278 | * @bnum: branch number | ||
| 279 | */ | ||
| 280 | static inline | ||
| 281 | struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, | ||
| 282 | const struct ubifs_idx_node *idx, | ||
| 283 | int bnum) | ||
| 284 | { | ||
| 285 | return (struct ubifs_branch *)((void *)idx->branches + | ||
| 286 | (UBIFS_BRANCH_SZ + c->key_len) * bnum); | ||
| 287 | } | ||
| 288 | |||
| 289 | /** | ||
| 290 | * ubifs_idx_key - return pointer to an index key. | ||
| 291 | * @c: the UBIFS file-system description object | ||
| 292 | * @idx: index node | ||
| 293 | */ | ||
| 294 | static inline void *ubifs_idx_key(const struct ubifs_info *c, | ||
| 295 | const struct ubifs_idx_node *idx) | ||
| 296 | { | ||
| 297 | return (void *)((struct ubifs_branch *)idx->branches)->key; | ||
| 298 | } | ||
| 299 | |||
| 300 | /** | ||
| 301 | * ubifs_reported_space - calculate reported free space. | ||
| 302 | * @c: the UBIFS file-system description object | ||
| 303 | * @free: amount of free space | ||
| 304 | * | ||
 * This function calculates the amount of free space which will be reported to
 * user-space. User-space applications tend to expect that if the file-system
 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
 * are able to write a file of size N. UBIFS attaches node headers to each data
 * node and it has to write indexing nodes as well. This introduces additional
 * overhead, and UBIFS has to report slightly less free space to meet the
 * above expectation.
| 312 | * | ||
| 313 | * This function assumes free space is made up of uncompressed data nodes and | ||
| 314 | * full index nodes (one per data node, doubled because we always allow enough | ||
| 315 | * space to write the index twice). | ||
| 316 | * | ||
| 317 | * Note, the calculation is pessimistic, which means that most of the time | ||
| 318 | * UBIFS reports less space than it actually has. | ||
| 319 | */ | ||
| 320 | static inline long long ubifs_reported_space(const struct ubifs_info *c, | ||
| 321 | uint64_t free) | ||
| 322 | { | ||
| 323 | int divisor, factor; | ||
| 324 | |||
| 325 | divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); | ||
| 326 | factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; | ||
| 327 | do_div(free, divisor); | ||
| 328 | |||
| 329 | return free * factor; | ||
| 330 | } | ||
| 331 | |||
| 332 | /** | ||
| 333 | * ubifs_current_time - round current time to time granularity. | ||
| 334 | * @inode: inode | ||
| 335 | */ | ||
| 336 | static inline struct timespec ubifs_current_time(struct inode *inode) | ||
| 337 | { | ||
| 338 | return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? | ||
| 339 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; | ||
| 340 | } | ||
| 341 | |||
| 342 | #endif /* __UBIFS_MISC_H__ */ | ||
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c new file mode 100644 index 000000000000..3afeb9242c6a --- /dev/null +++ b/fs/ubifs/orphan.c | |||
| @@ -0,0 +1,958 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Author: Adrian Hunter | ||
| 20 | */ | ||
| 21 | |||
| 22 | #include "ubifs.h" | ||
| 23 | |||
| 24 | /* | ||
| 25 | * An orphan is an inode number whose inode node has been committed to the index | ||
| 26 | * with a link count of zero. That happens when an open file is deleted | ||
| 27 | * (unlinked) and then a commit is run. In the normal course of events the inode | ||
| 28 | * would be deleted when the file is closed. However in the case of an unclean | ||
| 29 | * unmount, orphans need to be accounted for. After an unclean unmount, the | ||
| 30 | * orphans' inodes must be deleted which means either scanning the entire index | ||
| 31 | * looking for them, or keeping a list on flash somewhere. This unit implements | ||
| 32 | * the latter approach. | ||
| 33 | * | ||
| 34 | * The orphan area is a fixed number of LEBs situated between the LPT area and | ||
| 35 | * the main area. The number of orphan area LEBs is specified when the file | ||
| 36 | * system is created. The minimum number is 1. The size of the orphan area | ||
| 37 | * should be so that it can hold the maximum number of orphans that are expected | ||
| 38 | * to ever exist at one time. | ||
| 39 | * | ||
| 40 | * The number of orphans that can fit in a LEB is: | ||
| 41 | * | ||
| 42 | * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) | ||
| 43 | * | ||
| 44 | * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. | ||
| 45 | * | ||
| 46 | * Orphans are accumulated in a rb-tree. When an inode's link count drops to | ||
| 47 | * zero, the inode number is added to the rb-tree. It is removed from the tree | ||
| 48 | * when the inode is deleted. Any new orphans that are in the orphan tree when | ||
| 49 | * the commit is run, are written to the orphan area in 1 or more orph nodes. | ||
| 50 | * If the orphan area is full, it is consolidated to make space. There is | ||
| 51 | * always enough space because validation prevents the user from creating more | ||
| 52 | * than the maximum number of orphans allowed. | ||
| 53 | */ | ||
| 54 | |||
| 55 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 56 | static int dbg_check_orphans(struct ubifs_info *c); | ||
| 57 | #else | ||
| 58 | #define dbg_check_orphans(c) 0 | ||
| 59 | #endif | ||
| 60 | |||
| 61 | /** | ||
| 62 | * ubifs_add_orphan - add an orphan. | ||
| 63 | * @c: UBIFS file-system description object | ||
| 64 | * @inum: orphan inode number | ||
| 65 | * | ||
| 66 | * Add an orphan. This function is called when an inodes link count drops to | ||
| 67 | * zero. | ||
| 68 | */ | ||
int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
{
	struct ubifs_orphan *orphan, *o;
	struct rb_node **p, *parent = NULL;

	/* Allocate before taking the spinlock; GFP_NOFS to avoid recursion
	 * into the file system during reclaim */
	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
	if (!orphan)
		return -ENOMEM;
	orphan->inum = inum;
	orphan->new = 1;

	spin_lock(&c->orphan_lock);
	if (c->tot_orphans >= c->max_orphans) {
		/* The orphan area could not hold another one - refuse */
		spin_unlock(&c->orphan_lock);
		kfree(orphan);
		return -ENFILE;
	}
	/* Standard rb-tree insertion keyed by inode number */
	p = &c->orph_tree.rb_node;
	while (*p) {
		parent = *p;
		o = rb_entry(parent, struct ubifs_orphan, rb);
		if (inum < o->inum)
			p = &(*p)->rb_left;
		else if (inum > o->inum)
			p = &(*p)->rb_right;
		else {
			/* Already present - report but treat as success */
			dbg_err("orphaned twice");
			spin_unlock(&c->orphan_lock);
			kfree(orphan);
			return 0;
		}
	}
	c->tot_orphans += 1;
	c->new_orphans += 1;
	rb_link_node(&orphan->rb, parent, p);
	rb_insert_color(&orphan->rb, &c->orph_tree);
	/* New orphans stay on orph_new until the next commit picks them up */
	list_add_tail(&orphan->list, &c->orph_list);
	list_add_tail(&orphan->new_list, &c->orph_new);
	spin_unlock(&c->orphan_lock);
	dbg_gen("ino %lu", inum);
	return 0;
}
| 111 | |||
| 112 | /** | ||
| 113 | * ubifs_delete_orphan - delete an orphan. | ||
| 114 | * @c: UBIFS file-system description object | ||
| 115 | * @inum: orphan inode number | ||
| 116 | * | ||
| 117 | * Delete an orphan. This function is called when an inode is deleted. | ||
| 118 | */ | ||
void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
{
	struct ubifs_orphan *o;
	struct rb_node *p;

	spin_lock(&c->orphan_lock);
	p = c->orph_tree.rb_node;
	while (p) {
		o = rb_entry(p, struct ubifs_orphan, rb);
		if (inum < o->inum)
			p = p->rb_left;
		else if (inum > o->inum)
			p = p->rb_right;
		else {
			if (o->dnext) {
				/* Already scheduled for deletion */
				spin_unlock(&c->orphan_lock);
				dbg_gen("deleted twice ino %lu", inum);
				return;
			}
			if (o->cnext) {
				/*
				 * Being committed right now, so it cannot be
				 * removed from the tree; defer the deletion to
				 * erase_deleted() at the end of the commit.
				 */
				o->dnext = c->orph_dnext;
				c->orph_dnext = o;
				spin_unlock(&c->orphan_lock);
				dbg_gen("delete later ino %lu", inum);
				return;
			}
			rb_erase(p, &c->orph_tree);
			list_del(&o->list);
			c->tot_orphans -= 1;
			if (o->new) {
				/* Never committed - also leave orph_new */
				list_del(&o->new_list);
				c->new_orphans -= 1;
			}
			spin_unlock(&c->orphan_lock);
			kfree(o);
			dbg_gen("inum %lu", inum);
			return;
		}
	}
	spin_unlock(&c->orphan_lock);
	/* Deleting an orphan that was never added indicates a bug */
	dbg_err("missing orphan ino %lu", inum);
	dbg_dump_stack();
}
| 162 | |||
| 163 | /** | ||
| 164 | * ubifs_orphan_start_commit - start commit of orphans. | ||
| 165 | * @c: UBIFS file-system description object | ||
| 166 | * | ||
| 167 | * Start commit of orphans. | ||
| 168 | */ | ||
| 169 | int ubifs_orphan_start_commit(struct ubifs_info *c) | ||
| 170 | { | ||
| 171 | struct ubifs_orphan *orphan, **last; | ||
| 172 | |||
| 173 | spin_lock(&c->orphan_lock); | ||
| 174 | last = &c->orph_cnext; | ||
| 175 | list_for_each_entry(orphan, &c->orph_new, new_list) { | ||
| 176 | ubifs_assert(orphan->new); | ||
| 177 | orphan->new = 0; | ||
| 178 | *last = orphan; | ||
| 179 | last = &orphan->cnext; | ||
| 180 | } | ||
| 181 | *last = orphan->cnext; | ||
| 182 | c->cmt_orphans = c->new_orphans; | ||
| 183 | c->new_orphans = 0; | ||
| 184 | dbg_cmt("%d orphans to commit", c->cmt_orphans); | ||
| 185 | INIT_LIST_HEAD(&c->orph_new); | ||
| 186 | if (c->tot_orphans == 0) | ||
| 187 | c->no_orphs = 1; | ||
| 188 | else | ||
| 189 | c->no_orphs = 0; | ||
| 190 | spin_unlock(&c->orphan_lock); | ||
| 191 | return 0; | ||
| 192 | } | ||
| 193 | |||
| 194 | /** | ||
| 195 | * avail_orphs - calculate available space. | ||
| 196 | * @c: UBIFS file-system description object | ||
| 197 | * | ||
| 198 | * This function returns the number of orphans that can be written in the | ||
| 199 | * available space. | ||
| 200 | */ | ||
| 201 | static int avail_orphs(struct ubifs_info *c) | ||
| 202 | { | ||
| 203 | int avail_lebs, avail, gap; | ||
| 204 | |||
| 205 | avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; | ||
| 206 | avail = avail_lebs * | ||
| 207 | ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); | ||
| 208 | gap = c->leb_size - c->ohead_offs; | ||
| 209 | if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) | ||
| 210 | avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); | ||
| 211 | return avail; | ||
| 212 | } | ||
| 213 | |||
| 214 | /** | ||
| 215 | * tot_avail_orphs - calculate total space. | ||
| 216 | * @c: UBIFS file-system description object | ||
| 217 | * | ||
| 218 | * This function returns the number of orphans that can be written in half | ||
| 219 | * the total space. That leaves half the space for adding new orphans. | ||
| 220 | */ | ||
| 221 | static int tot_avail_orphs(struct ubifs_info *c) | ||
| 222 | { | ||
| 223 | int avail_lebs, avail; | ||
| 224 | |||
| 225 | avail_lebs = c->orph_lebs; | ||
| 226 | avail = avail_lebs * | ||
| 227 | ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); | ||
| 228 | return avail / 2; | ||
| 229 | } | ||
| 230 | |||
| 231 | /** | ||
| 232 | * do_write_orph_node - write a node | ||
| 233 | * @c: UBIFS file-system description object | ||
| 234 | * @len: length of node | ||
| 235 | * @atomic: write atomically | ||
| 236 | * | ||
| 237 | * This function writes a node to the orphan head from the orphan buffer. If | ||
| 238 | * %atomic is not zero, then the write is done atomically. On success, %0 is | ||
| 239 | * returned, otherwise a negative error code is returned. | ||
| 240 | */ | ||
| 241 | static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) | ||
| 242 | { | ||
| 243 | int err = 0; | ||
| 244 | |||
| 245 | if (atomic) { | ||
| 246 | ubifs_assert(c->ohead_offs == 0); | ||
| 247 | ubifs_prepare_node(c, c->orph_buf, len, 1); | ||
| 248 | len = ALIGN(len, c->min_io_size); | ||
| 249 | err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, | ||
| 250 | UBI_SHORTTERM); | ||
| 251 | } else { | ||
| 252 | if (c->ohead_offs == 0) { | ||
| 253 | /* Ensure LEB has been unmapped */ | ||
| 254 | err = ubifs_leb_unmap(c, c->ohead_lnum); | ||
| 255 | if (err) | ||
| 256 | return err; | ||
| 257 | } | ||
| 258 | err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, | ||
| 259 | c->ohead_offs, UBI_SHORTTERM); | ||
| 260 | } | ||
| 261 | return err; | ||
| 262 | } | ||
| 263 | |||
| 264 | /** | ||
| 265 | * write_orph_node - write an orph node | ||
| 266 | * @c: UBIFS file-system description object | ||
| 267 | * @atomic: write atomically | ||
| 268 | * | ||
| 269 | * This function builds an orph node from the cnext list and writes it to the | ||
| 270 | * orphan head. On success, %0 is returned, otherwise a negative error code | ||
| 271 | * is returned. | ||
| 272 | */ | ||
static int write_orph_node(struct ubifs_info *c, int atomic)
{
	struct ubifs_orphan *orphan, *cnext;
	struct ubifs_orph_node *orph;
	int gap, err, len, cnt, i;

	ubifs_assert(c->cmt_orphans > 0);
	gap = c->leb_size - c->ohead_offs;
	if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) {
		/* Not even one inode number fits - advance to the next LEB */
		c->ohead_lnum += 1;
		c->ohead_offs = 0;
		gap = c->leb_size;
		if (c->ohead_lnum > c->orph_last) {
			/*
			 * We limit the number of orphans so that this should
			 * never happen.
			 */
			ubifs_err("out of space in orphan area");
			return -EINVAL;
		}
	}
	/* Put as many of the pending orphans in this node as the gap allows */
	cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
	if (cnt > c->cmt_orphans)
		cnt = c->cmt_orphans;
	len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64);
	ubifs_assert(c->orph_buf);
	orph = c->orph_buf;
	orph->ch.node_type = UBIFS_ORPH_NODE;
	spin_lock(&c->orphan_lock);
	/* Consume 'cnt' entries from the head of the cnext list */
	cnext = c->orph_cnext;
	for (i = 0; i < cnt; i++) {
		orphan = cnext;
		orph->inos[i] = cpu_to_le64(orphan->inum);
		cnext = orphan->cnext;
		/* Clearing cnext lets ubifs_delete_orphan() free it again */
		orphan->cnext = NULL;
	}
	c->orph_cnext = cnext;
	c->cmt_orphans -= cnt;
	spin_unlock(&c->orphan_lock);
	if (c->cmt_orphans)
		orph->cmt_no = cpu_to_le64(c->cmt_no + 1);
	else
		/* Mark the last node of the commit */
		orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63));
	ubifs_assert(c->ohead_offs + len <= c->leb_size);
	ubifs_assert(c->ohead_lnum >= c->orph_first);
	ubifs_assert(c->ohead_lnum <= c->orph_last);
	err = do_write_orph_node(c, len, atomic);
	/* Advance the head past what was written, kept 8-byte aligned */
	c->ohead_offs += ALIGN(len, c->min_io_size);
	c->ohead_offs = ALIGN(c->ohead_offs, 8);
	return err;
}
| 325 | |||
| 326 | /** | ||
| 327 | * write_orph_nodes - write orph nodes until there are no more to commit | ||
| 328 | * @c: UBIFS file-system description object | ||
| 329 | * @atomic: write atomically | ||
| 330 | * | ||
| 331 | * This function writes orph nodes for all the orphans to commit. On success, | ||
| 332 | * %0 is returned, otherwise a negative error code is returned. | ||
| 333 | */ | ||
| 334 | static int write_orph_nodes(struct ubifs_info *c, int atomic) | ||
| 335 | { | ||
| 336 | int err; | ||
| 337 | |||
| 338 | while (c->cmt_orphans > 0) { | ||
| 339 | err = write_orph_node(c, atomic); | ||
| 340 | if (err) | ||
| 341 | return err; | ||
| 342 | } | ||
| 343 | if (atomic) { | ||
| 344 | int lnum; | ||
| 345 | |||
| 346 | /* Unmap any unused LEBs after consolidation */ | ||
| 347 | lnum = c->ohead_lnum + 1; | ||
| 348 | for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { | ||
| 349 | err = ubifs_leb_unmap(c, lnum); | ||
| 350 | if (err) | ||
| 351 | return err; | ||
| 352 | } | ||
| 353 | } | ||
| 354 | return 0; | ||
| 355 | } | ||
| 356 | |||
| 357 | /** | ||
| 358 | * consolidate - consolidate the orphan area. | ||
| 359 | * @c: UBIFS file-system description object | ||
| 360 | * | ||
| 361 | * This function enables consolidation by putting all the orphans into the list | ||
| 362 | * to commit. The list is in the order that the orphans were added, and the | ||
| 363 | * LEBs are written atomically in order, so at no time can orphans be lost by | ||
| 364 | * an unclean unmount. | ||
| 365 | * | ||
| 366 | * This function returns %0 on success and a negative error code on failure. | ||
| 367 | */ | ||
| 368 | static int consolidate(struct ubifs_info *c) | ||
| 369 | { | ||
| 370 | int tot_avail = tot_avail_orphs(c), err = 0; | ||
| 371 | |||
| 372 | spin_lock(&c->orphan_lock); | ||
| 373 | dbg_cmt("there is space for %d orphans and there are %d", | ||
| 374 | tot_avail, c->tot_orphans); | ||
| 375 | if (c->tot_orphans - c->new_orphans <= tot_avail) { | ||
| 376 | struct ubifs_orphan *orphan, **last; | ||
| 377 | int cnt = 0; | ||
| 378 | |||
| 379 | /* Change the cnext list to include all non-new orphans */ | ||
| 380 | last = &c->orph_cnext; | ||
| 381 | list_for_each_entry(orphan, &c->orph_list, list) { | ||
| 382 | if (orphan->new) | ||
| 383 | continue; | ||
| 384 | *last = orphan; | ||
| 385 | last = &orphan->cnext; | ||
| 386 | cnt += 1; | ||
| 387 | } | ||
| 388 | *last = orphan->cnext; | ||
| 389 | ubifs_assert(cnt == c->tot_orphans - c->new_orphans); | ||
| 390 | c->cmt_orphans = cnt; | ||
| 391 | c->ohead_lnum = c->orph_first; | ||
| 392 | c->ohead_offs = 0; | ||
| 393 | } else { | ||
| 394 | /* | ||
| 395 | * We limit the number of orphans so that this should | ||
| 396 | * never happen. | ||
| 397 | */ | ||
| 398 | ubifs_err("out of space in orphan area"); | ||
| 399 | err = -EINVAL; | ||
| 400 | } | ||
| 401 | spin_unlock(&c->orphan_lock); | ||
| 402 | return err; | ||
| 403 | } | ||
| 404 | |||
| 405 | /** | ||
| 406 | * commit_orphans - commit orphans. | ||
| 407 | * @c: UBIFS file-system description object | ||
| 408 | * | ||
| 409 | * This function commits orphans to flash. On success, %0 is returned, | ||
| 410 | * otherwise a negative error code is returned. | ||
| 411 | */ | ||
| 412 | static int commit_orphans(struct ubifs_info *c) | ||
| 413 | { | ||
| 414 | int avail, atomic = 0, err; | ||
| 415 | |||
| 416 | ubifs_assert(c->cmt_orphans > 0); | ||
| 417 | avail = avail_orphs(c); | ||
| 418 | if (avail < c->cmt_orphans) { | ||
| 419 | /* Not enough space to write new orphans, so consolidate */ | ||
| 420 | err = consolidate(c); | ||
| 421 | if (err) | ||
| 422 | return err; | ||
| 423 | atomic = 1; | ||
| 424 | } | ||
| 425 | err = write_orph_nodes(c, atomic); | ||
| 426 | return err; | ||
| 427 | } | ||
| 428 | |||
| 429 | /** | ||
| 430 | * erase_deleted - erase the orphans marked for deletion. | ||
| 431 | * @c: UBIFS file-system description object | ||
| 432 | * | ||
| 433 | * During commit, the orphans being committed cannot be deleted, so they are | ||
| 434 | * marked for deletion and deleted by this function. Also, the recovery | ||
| 435 | * adds killed orphans to the deletion list, and therefore they are deleted | ||
| 436 | * here too. | ||
| 437 | */ | ||
static void erase_deleted(struct ubifs_info *c)
{
	struct ubifs_orphan *orphan, *dnext;

	spin_lock(&c->orphan_lock);
	/* Walk the singly-linked deletion list threaded through 'dnext' */
	dnext = c->orph_dnext;
	while (dnext) {
		orphan = dnext;
		dnext = orphan->dnext;
		ubifs_assert(!orphan->new);
		rb_erase(&orphan->rb, &c->orph_tree);
		list_del(&orphan->list);
		c->tot_orphans -= 1;
		dbg_gen("deleting orphan ino %lu", orphan->inum);
		kfree(orphan);
	}
	c->orph_dnext = NULL;
	spin_unlock(&c->orphan_lock);
}
| 457 | |||
| 458 | /** | ||
| 459 | * ubifs_orphan_end_commit - end commit of orphans. | ||
| 460 | * @c: UBIFS file-system description object | ||
| 461 | * | ||
| 462 | * End commit of orphans. | ||
| 463 | */ | ||
| 464 | int ubifs_orphan_end_commit(struct ubifs_info *c) | ||
| 465 | { | ||
| 466 | int err; | ||
| 467 | |||
| 468 | if (c->cmt_orphans != 0) { | ||
| 469 | err = commit_orphans(c); | ||
| 470 | if (err) | ||
| 471 | return err; | ||
| 472 | } | ||
| 473 | erase_deleted(c); | ||
| 474 | err = dbg_check_orphans(c); | ||
| 475 | return err; | ||
| 476 | } | ||
| 477 | |||
| 478 | /** | ||
| 479 | * clear_orphans - erase all LEBs used for orphans. | ||
| 480 | * @c: UBIFS file-system description object | ||
| 481 | * | ||
| 482 | * If recovery is not required, then the orphans from the previous session | ||
| 483 | * are not needed. This function locates the LEBs used to record | ||
| 484 | * orphans, and un-maps them. | ||
| 485 | */ | ||
| 486 | static int clear_orphans(struct ubifs_info *c) | ||
| 487 | { | ||
| 488 | int lnum, err; | ||
| 489 | |||
| 490 | for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { | ||
| 491 | err = ubifs_leb_unmap(c, lnum); | ||
| 492 | if (err) | ||
| 493 | return err; | ||
| 494 | } | ||
| 495 | c->ohead_lnum = c->orph_first; | ||
| 496 | c->ohead_offs = 0; | ||
| 497 | return 0; | ||
| 498 | } | ||
| 499 | |||
| 500 | /** | ||
| 501 | * insert_dead_orphan - insert an orphan. | ||
| 502 | * @c: UBIFS file-system description object | ||
| 503 | * @inum: orphan inode number | ||
| 504 | * | ||
| 505 | * This function is a helper to the 'do_kill_orphans()' function. The orphan | ||
| 506 | * must be kept until the next commit, so it is added to the rb-tree and the | ||
| 507 | * deletion list. | ||
| 508 | */ | ||
static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
{
	struct ubifs_orphan *orphan, *o;
	struct rb_node **p, *parent = NULL;

	/* GFP_KERNEL is fine here: called only at mount time */
	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
	if (!orphan)
		return -ENOMEM;
	orphan->inum = inum;

	/* Standard rb-tree insertion keyed by inode number */
	p = &c->orph_tree.rb_node;
	while (*p) {
		parent = *p;
		o = rb_entry(parent, struct ubifs_orphan, rb);
		if (inum < o->inum)
			p = &(*p)->rb_left;
		else if (inum > o->inum)
			p = &(*p)->rb_right;
		else {
			/* Already added - no problem */
			kfree(orphan);
			return 0;
		}
	}
	c->tot_orphans += 1;
	rb_link_node(&orphan->rb, parent, p);
	rb_insert_color(&orphan->rb, &c->orph_tree);
	list_add_tail(&orphan->list, &c->orph_list);
	/* Dead orphans are queued for deletion at the next commit */
	orphan->dnext = c->orph_dnext;
	c->orph_dnext = orphan;
	dbg_mnt("ino %lu, new %d, tot %d",
		inum, c->new_orphans, c->tot_orphans);
	return 0;
}
| 543 | |||
| 544 | /** | ||
| 545 | * do_kill_orphans - remove orphan inodes from the index. | ||
| 546 | * @c: UBIFS file-system description object | ||
| 547 | * @sleb: scanned LEB | ||
| 548 | * @last_cmt_no: cmt_no of last orph node read is passed and returned here | ||
| 549 | * @outofdate: whether the LEB is out of date is returned here | ||
| 550 | * @last_flagged: whether the end orph node is encountered | ||
| 551 | * | ||
| 552 | * This function is a helper to the 'kill_orphans()' function. It goes through | ||
| 553 | * every orphan node in a LEB and for every inode number recorded, removes | ||
| 554 | * all keys for that inode from the TNC. | ||
| 555 | */ | ||
static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
			   unsigned long long *last_cmt_no, int *outofdate,
			   int *last_flagged)
{
	struct ubifs_scan_node *snod;
	struct ubifs_orph_node *orph;
	unsigned long long cmt_no;
	ino_t inum;
	int i, n, err, first = 1;

	list_for_each_entry(snod, &sleb->nodes, list) {
		if (snod->type != UBIFS_ORPH_NODE) {
			/* Only orph nodes may live in the orphan area */
			ubifs_err("invalid node type %d in orphan area at "
				  "%d:%d", snod->type, sleb->lnum, snod->offs);
			dbg_dump_node(c, snod->node);
			return -EINVAL;
		}

		orph = snod->node;

		/* Check commit number (top bit is the "last node" flag) */
		cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX;
		/*
		 * The commit number on the master node may be less, because
		 * of a failed commit. If there are several failed commits in a
		 * row, the commit number written on orph nodes will continue to
		 * increase (because the commit number is adjusted here) even
		 * though the commit number on the master node stays the same
		 * because the master node has not been re-written.
		 */
		if (cmt_no > c->cmt_no)
			c->cmt_no = cmt_no;
		if (cmt_no < *last_cmt_no && *last_flagged) {
			/*
			 * The last orph node had a higher commit number and was
			 * flagged as the last written for that commit number.
			 * That makes this orph node, out of date.
			 */
			if (!first) {
				/* Stale nodes may only start a LEB */
				ubifs_err("out of order commit number %llu in "
					  "orphan node at %d:%d",
					  cmt_no, sleb->lnum, snod->offs);
				dbg_dump_node(c, snod->node);
				return -EINVAL;
			}
			dbg_rcvry("out of date LEB %d", sleb->lnum);
			*outofdate = 1;
			return 0;
		}

		if (first)
			first = 0;

		/* Each recorded inode number occupies 8 bytes (__le64) */
		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
		for (i = 0; i < n; i++) {
			inum = le64_to_cpu(orph->inos[i]);
			dbg_rcvry("deleting orphaned inode %lu", inum);
			/* Purge all index entries of the orphaned inode */
			err = ubifs_tnc_remove_ino(c, inum);
			if (err)
				return err;
			/* Remember it until the next commit erases the area */
			err = insert_dead_orphan(c, inum);
			if (err)
				return err;
		}

		*last_cmt_no = cmt_no;
		if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) {
			dbg_rcvry("last orph node for commit %llu at %d:%d",
				  cmt_no, sleb->lnum, snod->offs);
			*last_flagged = 1;
		} else
			*last_flagged = 0;
	}

	return 0;
}
| 632 | |||
| 633 | /** | ||
| 634 | * kill_orphans - remove all orphan inodes from the index. | ||
| 635 | * @c: UBIFS file-system description object | ||
| 636 | * | ||
| 637 | * If recovery is required, then orphan inodes recorded during the previous | ||
| 638 | * session (which ended with an unclean unmount) must be deleted from the index. | ||
| 639 | * This is done by updating the TNC, but since the index is not updated until | ||
| 640 | * the next commit, the LEBs where the orphan information is recorded are not | ||
| 641 | * erased until the next commit. | ||
| 642 | */ | ||
static int kill_orphans(struct ubifs_info *c)
{
	unsigned long long last_cmt_no = 0;
	int lnum, err = 0, outofdate = 0, last_flagged = 0;

	c->ohead_lnum = c->orph_first;
	c->ohead_offs = 0;
	/* Check no-orphans flag and skip this if no orphans */
	if (c->no_orphs) {
		dbg_rcvry("no orphans");
		return 0;
	}
	/*
	 * Orph nodes always start at c->orph_first and are written to each
	 * successive LEB in turn. Generally unused LEBs will have been unmapped
	 * but may contain out of date orph nodes if the unmap didn't go
	 * through. In addition, the last orph node written for each commit is
	 * marked (top bit of orph->cmt_no is set to 1). It is possible that
	 * there are orph nodes from the next commit (i.e. the commit did not
	 * complete successfully). In that case, no orphans will have been lost
	 * due to the way that orphans are written, and any orphans added will
	 * be valid orphans anyway and so can be deleted.
	 */
	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
		struct ubifs_scan_leb *sleb;

		dbg_rcvry("LEB %d", lnum);
		sleb = ubifs_scan(c, lnum, 0, c->sbuf);
		if (IS_ERR(sleb)) {
			/* Scan failed - attempt recovery of this LEB */
			sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
			if (IS_ERR(sleb)) {
				err = PTR_ERR(sleb);
				break;
			}
		}
		err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate,
				      &last_flagged);
		if (err || outofdate) {
			ubifs_scan_destroy(sleb);
			break;
		}
		if (sleb->endpt) {
			/* Next commit will continue writing from here */
			c->ohead_lnum = lnum;
			c->ohead_offs = sleb->endpt;
		}
		ubifs_scan_destroy(sleb);
	}
	return err;
}
| 692 | |||
| 693 | /** | ||
| 694 | * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. | ||
| 695 | * @c: UBIFS file-system description object | ||
| 696 | * @unclean: indicates recovery from unclean unmount | ||
| 697 | * @read_only: indicates read only mount | ||
| 698 | * | ||
| 699 | * This function is called when mounting to erase orphans from the previous | ||
| 700 | * session. If UBIFS was not unmounted cleanly, then the inodes recorded as | ||
| 701 | * orphans are deleted. | ||
| 702 | */ | ||
| 703 | int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) | ||
| 704 | { | ||
| 705 | int err = 0; | ||
| 706 | |||
| 707 | c->max_orphans = tot_avail_orphs(c); | ||
| 708 | |||
| 709 | if (!read_only) { | ||
| 710 | c->orph_buf = vmalloc(c->leb_size); | ||
| 711 | if (!c->orph_buf) | ||
| 712 | return -ENOMEM; | ||
| 713 | } | ||
| 714 | |||
| 715 | if (unclean) | ||
| 716 | err = kill_orphans(c); | ||
| 717 | else if (!read_only) | ||
| 718 | err = clear_orphans(c); | ||
| 719 | |||
| 720 | return err; | ||
| 721 | } | ||
| 722 | |||
| 723 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 724 | |||
/* An orphan inode number as read back from the orphan area on flash */
struct check_orphan {
	struct rb_node rb;	/* link in the check tree (keyed by inum) */
	ino_t inum;		/* orphan inode number */
};

/* State accumulated while walking the index in dbg_check_orphans() */
struct check_info {
	unsigned long last_ino;		/* last inode number seen */
	unsigned long tot_inos;		/* count of distinct inodes visited */
	unsigned long missing;		/* orphans that should be recorded but are not */
	unsigned long long leaf_cnt;	/* total leaf nodes visited */
	struct ubifs_ino_node *node;	/* buffer for reading inode nodes */
	struct rb_root root;		/* tree of orphans read from flash */
};
| 738 | |||
| 739 | static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) | ||
| 740 | { | ||
| 741 | struct ubifs_orphan *o; | ||
| 742 | struct rb_node *p; | ||
| 743 | |||
| 744 | spin_lock(&c->orphan_lock); | ||
| 745 | p = c->orph_tree.rb_node; | ||
| 746 | while (p) { | ||
| 747 | o = rb_entry(p, struct ubifs_orphan, rb); | ||
| 748 | if (inum < o->inum) | ||
| 749 | p = p->rb_left; | ||
| 750 | else if (inum > o->inum) | ||
| 751 | p = p->rb_right; | ||
| 752 | else { | ||
| 753 | spin_unlock(&c->orphan_lock); | ||
| 754 | return 1; | ||
| 755 | } | ||
| 756 | } | ||
| 757 | spin_unlock(&c->orphan_lock); | ||
| 758 | return 0; | ||
| 759 | } | ||
| 760 | |||
/* Insert @inum into the check tree; duplicates are silently ignored */
static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
{
	struct check_orphan *orphan, *o;
	struct rb_node **p, *parent = NULL;

	orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS);
	if (!orphan)
		return -ENOMEM;
	orphan->inum = inum;

	/* Standard rb-tree insertion keyed by inode number */
	p = &root->rb_node;
	while (*p) {
		parent = *p;
		o = rb_entry(parent, struct check_orphan, rb);
		if (inum < o->inum)
			p = &(*p)->rb_left;
		else if (inum > o->inum)
			p = &(*p)->rb_right;
		else {
			/* Duplicate - keep the existing entry */
			kfree(orphan);
			return 0;
		}
	}
	rb_link_node(&orphan->rb, parent, p);
	rb_insert_color(&orphan->rb, root);
	return 0;
}
| 788 | |||
| 789 | static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) | ||
| 790 | { | ||
| 791 | struct check_orphan *o; | ||
| 792 | struct rb_node *p; | ||
| 793 | |||
| 794 | p = root->rb_node; | ||
| 795 | while (p) { | ||
| 796 | o = rb_entry(p, struct check_orphan, rb); | ||
| 797 | if (inum < o->inum) | ||
| 798 | p = p->rb_left; | ||
| 799 | else if (inum > o->inum) | ||
| 800 | p = p->rb_right; | ||
| 801 | else | ||
| 802 | return 1; | ||
| 803 | } | ||
| 804 | return 0; | ||
| 805 | } | ||
| 806 | |||
/* Free every node of the check tree */
static void dbg_free_check_tree(struct rb_root *root)
{
	struct rb_node *this = root->rb_node;
	struct check_orphan *o;

	/*
	 * Iterative post-order traversal: descend to a leaf, free it, and
	 * detach it from its parent so it is never visited again.
	 */
	while (this) {
		if (this->rb_left) {
			this = this->rb_left;
			continue;
		} else if (this->rb_right) {
			this = this->rb_right;
			continue;
		}
		o = rb_entry(this, struct check_orphan, rb);
		this = rb_parent(this);
		if (this) {
			/* Unlink the node being freed from its parent */
			if (this->rb_left == &o->rb)
				this->rb_left = NULL;
			else
				this->rb_right = NULL;
		}
		kfree(o);
	}
}
| 831 | |||
/*
 * TNC leaf callback: verify that every inode with nlink 0 found in the
 * index is recorded as an orphan either in memory or on flash.
 */
static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
			    void *priv)
{
	struct check_info *ci = priv;
	ino_t inum;
	int err;

	inum = key_inum(c, &zbr->key);
	if (inum != ci->last_ino) {
		/* Lowest node type is the inode node, so it comes first */
		if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
			ubifs_err("found orphan node ino %lu, type %d", inum,
				  key_type(c, &zbr->key));
		ci->last_ino = inum;
		ci->tot_inos += 1;
		err = ubifs_tnc_read_node(c, zbr, ci->node);
		if (err) {
			ubifs_err("node read failed, error %d", err);
			return err;
		}
		if (ci->node->nlink == 0)
			/* Must be recorded as an orphan */
			if (!dbg_find_check_orphan(&ci->root, inum) &&
			    !dbg_find_orphan(c, inum)) {
				ubifs_err("missing orphan, ino %lu", inum);
				ci->missing += 1;
			}
	}
	ci->leaf_cnt += 1;
	return 0;
}
| 863 | |||
| 864 | static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) | ||
| 865 | { | ||
| 866 | struct ubifs_scan_node *snod; | ||
| 867 | struct ubifs_orph_node *orph; | ||
| 868 | ino_t inum; | ||
| 869 | int i, n, err; | ||
| 870 | |||
| 871 | list_for_each_entry(snod, &sleb->nodes, list) { | ||
| 872 | cond_resched(); | ||
| 873 | if (snod->type != UBIFS_ORPH_NODE) | ||
| 874 | continue; | ||
| 875 | orph = snod->node; | ||
| 876 | n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; | ||
| 877 | for (i = 0; i < n; i++) { | ||
| 878 | inum = le64_to_cpu(orph->inos[i]); | ||
| 879 | err = dbg_ins_check_orphan(&ci->root, inum); | ||
| 880 | if (err) | ||
| 881 | return err; | ||
| 882 | } | ||
| 883 | } | ||
| 884 | return 0; | ||
| 885 | } | ||
| 886 | |||
| 887 | static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) | ||
| 888 | { | ||
| 889 | int lnum, err = 0; | ||
| 890 | |||
| 891 | /* Check no-orphans flag and skip this if no orphans */ | ||
| 892 | if (c->no_orphs) | ||
| 893 | return 0; | ||
| 894 | |||
| 895 | for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { | ||
| 896 | struct ubifs_scan_leb *sleb; | ||
| 897 | |||
| 898 | sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); | ||
| 899 | if (IS_ERR(sleb)) { | ||
| 900 | err = PTR_ERR(sleb); | ||
| 901 | break; | ||
| 902 | } | ||
| 903 | |||
| 904 | err = dbg_read_orphans(ci, sleb); | ||
| 905 | ubifs_scan_destroy(sleb); | ||
| 906 | if (err) | ||
| 907 | break; | ||
| 908 | } | ||
| 909 | |||
| 910 | return err; | ||
| 911 | } | ||
| 912 | |||
| 913 | static int dbg_check_orphans(struct ubifs_info *c) | ||
| 914 | { | ||
| 915 | struct check_info ci; | ||
| 916 | int err; | ||
| 917 | |||
| 918 | if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) | ||
| 919 | return 0; | ||
| 920 | |||
| 921 | ci.last_ino = 0; | ||
| 922 | ci.tot_inos = 0; | ||
| 923 | ci.missing = 0; | ||
| 924 | ci.leaf_cnt = 0; | ||
| 925 | ci.root = RB_ROOT; | ||
| 926 | ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); | ||
| 927 | if (!ci.node) { | ||
| 928 | ubifs_err("out of memory"); | ||
| 929 | return -ENOMEM; | ||
| 930 | } | ||
| 931 | |||
| 932 | err = dbg_scan_orphans(c, &ci); | ||
| 933 | if (err) | ||
| 934 | goto out; | ||
| 935 | |||
| 936 | err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); | ||
| 937 | if (err) { | ||
| 938 | ubifs_err("cannot scan TNC, error %d", err); | ||
| 939 | goto out; | ||
| 940 | } | ||
| 941 | |||
| 942 | if (ci.missing) { | ||
| 943 | ubifs_err("%lu missing orphan(s)", ci.missing); | ||
| 944 | err = -EINVAL; | ||
| 945 | goto out; | ||
| 946 | } | ||
| 947 | |||
| 948 | dbg_cmt("last inode number is %lu", ci.last_ino); | ||
| 949 | dbg_cmt("total number of inodes is %lu", ci.tot_inos); | ||
| 950 | dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); | ||
| 951 | |||
| 952 | out: | ||
| 953 | dbg_free_check_tree(&ci.root); | ||
| 954 | kfree(ci.node); | ||
| 955 | return err; | ||
| 956 | } | ||
| 957 | |||
| 958 | #endif /* CONFIG_UBIFS_FS_DEBUG */ | ||
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c new file mode 100644 index 000000000000..77d26c141cf6 --- /dev/null +++ b/fs/ubifs/recovery.c | |||
| @@ -0,0 +1,1519 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements functions needed to recover from unclean un-mounts. | ||
| 25 | * When UBIFS is mounted, it checks a flag on the master node to determine if | ||
 * an un-mount was completed successfully. If not, the process of mounting
 * incorporates additional checking and fixing of on-flash data structures.
| 28 | * UBIFS always cleans away all remnants of an unclean un-mount, so that | ||
| 29 | * errors do not accumulate. However UBIFS defers recovery if it is mounted | ||
| 30 | * read-only, and the flash is not modified in that case. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include <linux/crc32.h> | ||
| 34 | #include "ubifs.h" | ||
| 35 | |||
/**
 * is_empty - determine whether a buffer is empty (contains all 0xff).
 * @buf: buffer to check
 * @len: length of buffer
 *
 * This function returns %1 if the buffer is empty (contains all 0xff) otherwise
 * %0 is returned.
 */
static int is_empty(void *buf, int len)
{
	uint8_t *byte = buf;
	uint8_t *end = byte + len;

	while (byte < end)
		if (*byte++ != 0xff)
			return 0;
	return 1;
}
| 54 | |||
/**
 * get_master_node - get the last valid master node allowing for corruption.
 * @c: UBIFS file-system description object
 * @lnum: LEB number
 * @pbuf: buffer containing the LEB read, is returned here (caller must vfree)
 * @mst: master node, if found, is returned here (points into @pbuf)
 * @cor: corruption, if found, is returned here (points into @pbuf)
 *
 * This function allocates a buffer, reads the LEB into it, and finds and
 * returns the last valid master node allowing for one area of corruption.
 * The corrupt area, if there is one, must be consistent with the assumption
 * that it is the result of an unclean unmount while the master node was being
 * written. Under those circumstances, it is valid to use the previously written
 * master node.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
			   struct ubifs_mst_node **mst, void **cor)
{
	/* Master nodes are written at aligned multiples of 'sz' in the LEB */
	const int sz = c->mst_node_alsz;
	int err, offs, len;
	void *sbuf, *buf;

	sbuf = vmalloc(c->leb_size);
	if (!sbuf)
		return -ENOMEM;

	/*
	 * -EBADMSG (uncorrectable data) is tolerated here: corruption is
	 * exactly what this function is looking for, and is analyzed below.
	 */
	err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size);
	if (err && err != -EBADMSG)
		goto out_free;

	/* Find the first position that is definitely not a node */
	offs = 0;
	buf = sbuf;
	len = c->leb_size;
	while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
		struct ubifs_ch *ch = buf;

		if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
			break;
		offs += sz;
		buf += sz;
		len -= sz;
	}
	/* See if there was a valid master node before that */
	if (offs) {
		int ret;

		/* Step back to the last candidate that had a valid magic */
		offs -= sz;
		buf -= sz;
		len += sz;
		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
		if (ret != SCANNED_A_NODE && offs) {
			/* Could have been corruption so check one place back */
			offs -= sz;
			buf -= sz;
			len += sz;
			ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
			if (ret != SCANNED_A_NODE)
				/*
				 * We accept only one area of corruption because
				 * we are assuming that it was caused while
				 * trying to write a master node.
				 */
				goto out_err;
		}
		if (ret == SCANNED_A_NODE) {
			struct ubifs_ch *ch = buf;

			if (ch->node_type != UBIFS_MST_NODE)
				goto out_err;
			dbg_rcvry("found a master node at %d:%d", lnum, offs);
			*mst = buf;
			/* Move past the valid node to examine what follows */
			offs += sz;
			buf += sz;
			len -= sz;
		}
	}
	/* Check for corruption */
	if (offs < c->leb_size) {
		if (!is_empty(buf, min_t(int, len, sz))) {
			*cor = buf;
			dbg_rcvry("found corruption at %d:%d", lnum, offs);
		}
		offs += sz;
		buf += sz;
		len -= sz;
	}
	/* Check remaining empty space */
	if (offs < c->leb_size)
		if (!is_empty(buf, len))
			goto out_err;
	/* Hand the whole LEB buffer to the caller */
	*pbuf = sbuf;
	return 0;

out_err:
	err = -EINVAL;
out_free:
	vfree(sbuf);
	*mst = NULL;
	*cor = NULL;
	return err;
}
| 159 | |||
| 160 | /** | ||
| 161 | * write_rcvrd_mst_node - write recovered master node. | ||
| 162 | * @c: UBIFS file-system description object | ||
| 163 | * @mst: master node | ||
| 164 | * | ||
| 165 | * This function returns %0 on success and a negative error code on failure. | ||
| 166 | */ | ||
| 167 | static int write_rcvrd_mst_node(struct ubifs_info *c, | ||
| 168 | struct ubifs_mst_node *mst) | ||
| 169 | { | ||
| 170 | int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; | ||
| 171 | uint32_t save_flags; | ||
| 172 | |||
| 173 | dbg_rcvry("recovery"); | ||
| 174 | |||
| 175 | save_flags = mst->flags; | ||
| 176 | mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY); | ||
| 177 | |||
| 178 | ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); | ||
| 179 | err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); | ||
| 180 | if (err) | ||
| 181 | goto out; | ||
| 182 | err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); | ||
| 183 | if (err) | ||
| 184 | goto out; | ||
| 185 | out: | ||
| 186 | mst->flags = save_flags; | ||
| 187 | return err; | ||
| 188 | } | ||
| 189 | |||
/**
 * ubifs_recover_master_node - recover the master node.
 * @c: UBIFS file-system description object
 *
 * This function recovers the master node from corruption that may occur due to
 * an unclean unmount. Both master-node LEBs are examined, and the valid
 * node / corruption layout found in each is checked against the possible
 * states an interrupted master-node write could have left behind. The chosen
 * node is copied into 'c->mst_node'; in read-only mode a copy is also saved
 * in 'c->rcvrd_mst_node' for a later remount to read-write.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_recover_master_node(struct ubifs_info *c)
{
	void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
	struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
	const int sz = c->mst_node_alsz;
	int err, offs1, offs2;

	dbg_rcvry("recovery");

	/* Examine both copies of the master node area */
	err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
	if (err)
		goto out_free;

	err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
	if (err)
		goto out_free;

	if (mst1) {
		/* Offsets within the LEB tell which node was written last */
		offs1 = (void *)mst1 - buf1;
		if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) &&
		    (offs1 == 0 && !cor1)) {
			/*
			 * mst1 was written by recovery at offset 0 with no
			 * corruption.
			 */
			dbg_rcvry("recovery recovery");
			mst = mst1;
		} else if (mst2) {
			offs2 = (void *)mst2 - buf2;
			if (offs1 == offs2) {
				/* Same offset, so must be the same */
				if (memcmp((void *)mst1 + UBIFS_CH_SZ,
					   (void *)mst2 + UBIFS_CH_SZ,
					   UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
					goto out_err;
				mst = mst1;
			} else if (offs2 + sz == offs1) {
				/* 1st LEB was written, 2nd was not */
				if (cor1)
					goto out_err;
				mst = mst1;
			} else if (offs1 == 0 && offs2 + sz >= c->leb_size) {
				/* 1st LEB was unmapped and written, 2nd not */
				if (cor1)
					goto out_err;
				mst = mst1;
			} else
				goto out_err;
		} else {
			/*
			 * 2nd LEB was unmapped and about to be written, so
			 * there must be only one master node in the first LEB
			 * and no corruption.
			 */
			if (offs1 != 0 || cor1)
				goto out_err;
			mst = mst1;
		}
	} else {
		if (!mst2)
			goto out_err;
		/*
		 * 1st LEB was unmapped and about to be written, so there must
		 * be no room left in 2nd LEB.
		 */
		offs2 = (void *)mst2 - buf2;
		if (offs2 + sz + sz <= c->leb_size)
			goto out_err;
		mst = mst2;
	}

	dbg_rcvry("recovered master node from LEB %d",
		  (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));

	memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);

	if ((c->vfs_sb->s_flags & MS_RDONLY)) {
		/* Read-only mode. Keep a copy for switching to rw mode */
		c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
		if (!c->rcvrd_mst_node) {
			err = -ENOMEM;
			goto out_free;
		}
		memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
	} else {
		/* Write the recovered master node */
		c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
		err = write_rcvrd_mst_node(c, c->mst_node);
		if (err)
			goto out_free;
	}

	vfree(buf2);
	vfree(buf1);

	return 0;

out_err:
	err = -EINVAL;
out_free:
	ubifs_err("failed to recover master node");
	if (mst1) {
		dbg_err("dumping first master node");
		dbg_dump_node(c, mst1);
	}
	if (mst2) {
		dbg_err("dumping second master node");
		dbg_dump_node(c, mst2);
	}
	vfree(buf2);
	vfree(buf1);
	return err;
}
| 312 | |||
| 313 | /** | ||
| 314 | * ubifs_write_rcvrd_mst_node - write the recovered master node. | ||
| 315 | * @c: UBIFS file-system description object | ||
| 316 | * | ||
| 317 | * This function writes the master node that was recovered during mounting in | ||
| 318 | * read-only mode and must now be written because we are remounting rw. | ||
| 319 | * | ||
| 320 | * This function returns %0 on success and a negative error code on failure. | ||
| 321 | */ | ||
| 322 | int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) | ||
| 323 | { | ||
| 324 | int err; | ||
| 325 | |||
| 326 | if (!c->rcvrd_mst_node) | ||
| 327 | return 0; | ||
| 328 | c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); | ||
| 329 | c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); | ||
| 330 | err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); | ||
| 331 | if (err) | ||
| 332 | return err; | ||
| 333 | kfree(c->rcvrd_mst_node); | ||
| 334 | c->rcvrd_mst_node = NULL; | ||
| 335 | return 0; | ||
| 336 | } | ||
| 337 | |||
| 338 | /** | ||
| 339 | * is_last_write - determine if an offset was in the last write to a LEB. | ||
| 340 | * @c: UBIFS file-system description object | ||
| 341 | * @buf: buffer to check | ||
| 342 | * @offs: offset to check | ||
| 343 | * | ||
| 344 | * This function returns %1 if @offs was in the last write to the LEB whose data | ||
| 345 | * is in @buf, otherwise %0 is returned. The determination is made by checking | ||
| 346 | * for subsequent empty space starting from the next min_io_size boundary (or a | ||
| 347 | * bit less than the common header size if min_io_size is one). | ||
| 348 | */ | ||
| 349 | static int is_last_write(const struct ubifs_info *c, void *buf, int offs) | ||
| 350 | { | ||
| 351 | int empty_offs; | ||
| 352 | int check_len; | ||
| 353 | uint8_t *p; | ||
| 354 | |||
| 355 | if (c->min_io_size == 1) { | ||
| 356 | check_len = c->leb_size - offs; | ||
| 357 | p = buf + check_len; | ||
| 358 | for (; check_len > 0; check_len--) | ||
| 359 | if (*--p != 0xff) | ||
| 360 | break; | ||
| 361 | /* | ||
| 362 | * 'check_len' is the size of the corruption which cannot be | ||
| 363 | * more than the size of 1 node if it was caused by an unclean | ||
| 364 | * unmount. | ||
| 365 | */ | ||
| 366 | if (check_len > UBIFS_MAX_NODE_SZ) | ||
| 367 | return 0; | ||
| 368 | return 1; | ||
| 369 | } | ||
| 370 | |||
| 371 | /* | ||
| 372 | * Round up to the next c->min_io_size boundary i.e. 'offs' is in the | ||
| 373 | * last wbuf written. After that should be empty space. | ||
| 374 | */ | ||
| 375 | empty_offs = ALIGN(offs + 1, c->min_io_size); | ||
| 376 | check_len = c->leb_size - empty_offs; | ||
| 377 | p = buf + empty_offs - offs; | ||
| 378 | |||
| 379 | for (; check_len > 0; check_len--) | ||
| 380 | if (*p++ != 0xff) | ||
| 381 | return 0; | ||
| 382 | return 1; | ||
| 383 | } | ||
| 384 | |||
/**
 * clean_buf - clean the data from an LEB sitting in a buffer.
 * @c: UBIFS file-system description object
 * @buf: buffer to clean
 * @lnum: LEB number to clean
 * @offs: offset from which to clean
 * @len: length of buffer
 *
 * This function pads up to the next min_io_size boundary (if there is one) and
 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
 * min_io_size boundary (if there is one).
 */
static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
		      int *offs, int *len)
{
	int empty_offs, pad_len;

	/*
	 * NOTE(review): 'lnum' is only used for the debug message below;
	 * this self-assignment presumably keeps the parameter "used" when
	 * debug output is compiled out - confirm before removing.
	 */
	lnum = lnum;
	dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);

	if (c->min_io_size == 1) {
		/* No I/O alignment: simply erase the whole tail to 0xff */
		memset(*buf, 0xff, c->leb_size - *offs);
		return;
	}

	/* Nodes are 8-byte aligned, so the cleaning point must be too */
	ubifs_assert(!(*offs & 7));
	empty_offs = ALIGN(*offs, c->min_io_size);
	pad_len = empty_offs - *offs;
	/* Pad up to the min_io_size boundary... */
	ubifs_pad(c, *buf, pad_len);
	*offs += pad_len;
	*buf += pad_len;
	*len -= pad_len;
	/* ...and set everything after it to empty space (0xff) */
	memset(*buf, 0xff, c->leb_size - empty_offs);
}
| 419 | |||
| 420 | /** | ||
| 421 | * no_more_nodes - determine if there are no more nodes in a buffer. | ||
| 422 | * @c: UBIFS file-system description object | ||
| 423 | * @buf: buffer to check | ||
| 424 | * @len: length of buffer | ||
| 425 | * @lnum: LEB number of the LEB from which @buf was read | ||
| 426 | * @offs: offset from which @buf was read | ||
| 427 | * | ||
| 428 | * This function scans @buf for more nodes and returns %0 is a node is found and | ||
| 429 | * %1 if no more nodes are found. | ||
| 430 | */ | ||
| 431 | static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, | ||
| 432 | int lnum, int offs) | ||
| 433 | { | ||
| 434 | int skip, next_offs = 0; | ||
| 435 | |||
| 436 | if (len > UBIFS_DATA_NODE_SZ) { | ||
| 437 | struct ubifs_ch *ch = buf; | ||
| 438 | int dlen = le32_to_cpu(ch->len); | ||
| 439 | |||
| 440 | if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && | ||
| 441 | dlen <= UBIFS_MAX_DATA_NODE_SZ) | ||
| 442 | /* The corrupt node looks like a data node */ | ||
| 443 | next_offs = ALIGN(offs + dlen, 8); | ||
| 444 | } | ||
| 445 | |||
| 446 | if (c->min_io_size == 1) | ||
| 447 | skip = 8; | ||
| 448 | else | ||
| 449 | skip = ALIGN(offs + 1, c->min_io_size) - offs; | ||
| 450 | |||
| 451 | offs += skip; | ||
| 452 | buf += skip; | ||
| 453 | len -= skip; | ||
| 454 | while (len > 8) { | ||
| 455 | struct ubifs_ch *ch = buf; | ||
| 456 | uint32_t magic = le32_to_cpu(ch->magic); | ||
| 457 | int ret; | ||
| 458 | |||
| 459 | if (magic == UBIFS_NODE_MAGIC) { | ||
| 460 | ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); | ||
| 461 | if (ret == SCANNED_A_NODE || ret > 0) { | ||
| 462 | /* | ||
| 463 | * There is a small chance this is just data in | ||
| 464 | * a data node, so check that possibility. e.g. | ||
| 465 | * this is part of a file that itself contains | ||
| 466 | * a UBIFS image. | ||
| 467 | */ | ||
| 468 | if (next_offs && offs + le32_to_cpu(ch->len) <= | ||
| 469 | next_offs) | ||
| 470 | continue; | ||
| 471 | dbg_rcvry("unexpected node at %d:%d", lnum, | ||
| 472 | offs); | ||
| 473 | return 0; | ||
| 474 | } | ||
| 475 | } | ||
| 476 | offs += 8; | ||
| 477 | buf += 8; | ||
| 478 | len -= 8; | ||
| 479 | } | ||
| 480 | return 1; | ||
| 481 | } | ||
| 482 | |||
/**
 * fix_unclean_leb - fix an unclean LEB.
 * @c: UBIFS file-system description object
 * @sleb: scanned LEB information
 * @start: offset where scan started
 *
 * If mounted read-only, the LEB is only queued on 'c->unclean_leb_list' to be
 * fixed later; otherwise the cleaned data is written back to the flash
 * immediately. Returns %0 on success and a negative error code on failure.
 */
static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
			   int start)
{
	int lnum = sleb->lnum, endpt = start;

	/* Get the end offset of the last node we are keeping */
	if (!list_empty(&sleb->nodes)) {
		struct ubifs_scan_node *snod;

		snod = list_entry(sleb->nodes.prev,
				  struct ubifs_scan_node, list);
		endpt = snod->offs + snod->len;
	}

	if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
		/* Add to recovery list */
		struct ubifs_unclean_leb *ucleb;

		dbg_rcvry("need to fix LEB %d start %d endpt %d",
			  lnum, start, sleb->endpt);
		ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
		if (!ucleb)
			return -ENOMEM;
		ucleb->lnum = lnum;
		ucleb->endpt = endpt;
		list_add_tail(&ucleb->list, &c->unclean_leb_list);
	} else {
		/* Write the fixed LEB back to flash */
		int err;

		dbg_rcvry("fixing LEB %d start %d endpt %d",
			  lnum, start, sleb->endpt);
		if (endpt == 0) {
			/* Nothing to keep - just unmap the whole LEB */
			err = ubifs_leb_unmap(c, lnum);
			if (err)
				return err;
		} else {
			int len = ALIGN(endpt, c->min_io_size);

			/* Re-read the part before the scan start, which the
			 * scan buffer does not contain */
			if (start) {
				err = ubi_read(c->ubi, lnum, sleb->buf, 0,
					       start);
				if (err)
					return err;
			}
			/* Pad to min_io_size */
			if (len > endpt) {
				int pad_len = len - ALIGN(endpt, 8);

				if (pad_len > 0) {
					void *buf = sleb->buf + len - pad_len;

					ubifs_pad(c, buf, pad_len);
				}
			}
			err = ubi_leb_change(c->ubi, lnum, sleb->buf, len,
					     UBI_UNKNOWN);
			if (err)
				return err;
		}
	}
	return 0;
}
| 552 | |||
| 553 | /** | ||
| 554 | * drop_incomplete_group - drop nodes from an incomplete group. | ||
| 555 | * @sleb: scanned LEB information | ||
| 556 | * @offs: offset of dropped nodes is returned here | ||
| 557 | * | ||
| 558 | * This function returns %1 if nodes are dropped and %0 otherwise. | ||
| 559 | */ | ||
| 560 | static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) | ||
| 561 | { | ||
| 562 | int dropped = 0; | ||
| 563 | |||
| 564 | while (!list_empty(&sleb->nodes)) { | ||
| 565 | struct ubifs_scan_node *snod; | ||
| 566 | struct ubifs_ch *ch; | ||
| 567 | |||
| 568 | snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, | ||
| 569 | list); | ||
| 570 | ch = snod->node; | ||
| 571 | if (ch->group_type != UBIFS_IN_NODE_GROUP) | ||
| 572 | return dropped; | ||
| 573 | dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); | ||
| 574 | *offs = snod->offs; | ||
| 575 | list_del(&snod->list); | ||
| 576 | kfree(snod); | ||
| 577 | sleb->nodes_cnt -= 1; | ||
| 578 | dropped = 1; | ||
| 579 | } | ||
| 580 | return dropped; | ||
| 581 | } | ||
| 582 | |||
/**
 * ubifs_recover_leb - scan and recover a LEB.
 * @c: UBIFS file-system description object
 * @lnum: LEB number
 * @offs: offset
 * @sbuf: LEB-sized buffer to use
 * @grouped: nodes may be grouped for recovery
 *
 * This function does a scan of a LEB, but caters for errors that might have
 * been caused by the unclean unmount from which we are attempting to recover.
 *
 * This function returns the scanned information on success and an error
 * pointer on failure (%-EUCLEAN if the LEB contains corruption that cannot
 * be attributed to the unclean unmount).
 */
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
					 int offs, void *sbuf, int grouped)
{
	int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
	int empty_chkd = 0, start = offs;
	struct ubifs_scan_leb *sleb;
	void *buf = sbuf + offs;

	dbg_rcvry("%d:%d", lnum, offs);

	sleb = ubifs_start_scan(c, lnum, offs, sbuf);
	if (IS_ERR(sleb))
		return sleb;

	/* A read with bit-flips means the LEB must be rewritten */
	if (sleb->ecc)
		need_clean = 1;

	while (len >= 8) {
		int ret;

		dbg_scan("look at LEB %d:%d (%d bytes left)",
			 lnum, offs, len);

		cond_resched();

		/*
		 * Scan quietly until there is an error from which we cannot
		 * recover
		 */
		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);

		if (ret == SCANNED_A_NODE) {
			/* A valid node, and not a padding node */
			struct ubifs_ch *ch = buf;
			int node_len;

			err = ubifs_add_snod(c, sleb, buf, offs);
			if (err)
				goto error;
			node_len = ALIGN(le32_to_cpu(ch->len), 8);
			offs += node_len;
			buf += node_len;
			len -= node_len;
			continue;
		}

		if (ret > 0) {
			/* Padding bytes or a valid padding node */
			offs += ret;
			buf += ret;
			len -= ret;
			continue;
		}

		if (ret == SCANNED_EMPTY_SPACE) {
			if (!is_empty(buf, len)) {
				/*
				 * Non-empty "empty space" is only acceptable
				 * if it belongs to the interrupted last write
				 */
				if (!is_last_write(c, buf, offs))
					break;
				clean_buf(c, &buf, lnum, &offs, &len);
				need_clean = 1;
			}
			empty_chkd = 1;
			break;
		}

		if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
			if (is_last_write(c, buf, offs)) {
				clean_buf(c, &buf, lnum, &offs, &len);
				need_clean = 1;
				empty_chkd = 1;
				break;
			}

		if (ret == SCANNED_A_CORRUPT_NODE)
			if (no_more_nodes(c, buf, len, lnum, offs)) {
				clean_buf(c, &buf, lnum, &offs, &len);
				need_clean = 1;
				empty_chkd = 1;
				break;
			}

		if (quiet) {
			/* Redo the last scan but noisily */
			quiet = 0;
			continue;
		}

		/* Unrecoverable problem - report it */
		switch (ret) {
		case SCANNED_GARBAGE:
			dbg_err("garbage");
			goto corrupted;
		case SCANNED_A_CORRUPT_NODE:
		case SCANNED_A_BAD_PAD_NODE:
			dbg_err("bad node");
			goto corrupted;
		default:
			dbg_err("unknown");
			goto corrupted;
		}
	}

	/* Anything left after the nodes must be empty or the last write */
	if (!empty_chkd && !is_empty(buf, len)) {
		if (is_last_write(c, buf, offs)) {
			clean_buf(c, &buf, lnum, &offs, &len);
			need_clean = 1;
		} else {
			ubifs_err("corrupt empty space at LEB %d:%d",
				  lnum, offs);
			goto corrupted;
		}
	}

	/* Drop nodes from incomplete group */
	if (grouped && drop_incomplete_group(sleb, &offs)) {
		buf = sbuf + offs;
		len = c->leb_size - offs;
		clean_buf(c, &buf, lnum, &offs, &len);
		need_clean = 1;
	}

	/* The end point must be I/O aligned before writing back */
	if (offs % c->min_io_size) {
		clean_buf(c, &buf, lnum, &offs, &len);
		need_clean = 1;
	}

	ubifs_end_scan(c, sleb, lnum, offs);

	if (need_clean) {
		err = fix_unclean_leb(c, sleb, start);
		if (err)
			goto error;
	}

	return sleb;

corrupted:
	ubifs_scanned_corruption(c, lnum, offs, buf);
	err = -EUCLEAN;
error:
	ubifs_err("LEB %d scanning failed", lnum);
	ubifs_scan_destroy(sleb);
	return ERR_PTR(err);
}
| 739 | |||
| 740 | /** | ||
| 741 | * get_cs_sqnum - get commit start sequence number. | ||
| 742 | * @c: UBIFS file-system description object | ||
| 743 | * @lnum: LEB number of commit start node | ||
| 744 | * @offs: offset of commit start node | ||
| 745 | * @cs_sqnum: commit start sequence number is returned here | ||
| 746 | * | ||
| 747 | * This function returns %0 on success and a negative error code on failure. | ||
| 748 | */ | ||
| 749 | static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, | ||
| 750 | unsigned long long *cs_sqnum) | ||
| 751 | { | ||
| 752 | struct ubifs_cs_node *cs_node = NULL; | ||
| 753 | int err, ret; | ||
| 754 | |||
| 755 | dbg_rcvry("at %d:%d", lnum, offs); | ||
| 756 | cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); | ||
| 757 | if (!cs_node) | ||
| 758 | return -ENOMEM; | ||
| 759 | if (c->leb_size - offs < UBIFS_CS_NODE_SZ) | ||
| 760 | goto out_err; | ||
| 761 | err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); | ||
| 762 | if (err && err != -EBADMSG) | ||
| 763 | goto out_free; | ||
| 764 | ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); | ||
| 765 | if (ret != SCANNED_A_NODE) { | ||
| 766 | dbg_err("Not a valid node"); | ||
| 767 | goto out_err; | ||
| 768 | } | ||
| 769 | if (cs_node->ch.node_type != UBIFS_CS_NODE) { | ||
| 770 | dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); | ||
| 771 | goto out_err; | ||
| 772 | } | ||
| 773 | if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { | ||
| 774 | dbg_err("CS node cmt_no %llu != current cmt_no %llu", | ||
| 775 | (unsigned long long)le64_to_cpu(cs_node->cmt_no), | ||
| 776 | c->cmt_no); | ||
| 777 | goto out_err; | ||
| 778 | } | ||
| 779 | *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); | ||
| 780 | dbg_rcvry("commit start sqnum %llu", *cs_sqnum); | ||
| 781 | kfree(cs_node); | ||
| 782 | return 0; | ||
| 783 | |||
| 784 | out_err: | ||
| 785 | err = -EINVAL; | ||
| 786 | out_free: | ||
| 787 | ubifs_err("failed to get CS sqnum"); | ||
| 788 | kfree(cs_node); | ||
| 789 | return err; | ||
| 790 | } | ||
| 791 | |||
/**
 * ubifs_recover_log_leb - scan and recover a log LEB.
 * @c: UBIFS file-system description object
 * @lnum: LEB number
 * @offs: offset
 * @sbuf: LEB-sized buffer to use
 *
 * This function does a scan of a LEB, but caters for errors that might have
 * been caused by the unclean unmount from which we are attempting to recover.
 *
 * This function returns the scanned information on success and the error code
 * wrapped in an ERR_PTR on failure (note: it does not return %0 - the return
 * value is a pointer).
 */
struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
					     int offs, void *sbuf)
{
	struct ubifs_scan_leb *sleb;
	int next_lnum;

	dbg_rcvry("LEB %d", lnum);
	/* The log LEBs form a circular buffer - wrap at the last log LEB */
	next_lnum = lnum + 1;
	if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs)
		next_lnum = UBIFS_LOG_LNUM;
	if (next_lnum != c->ltail_lnum) {
		/*
		 * We can only recover at the end of the log, so check that the
		 * next log LEB is empty or out of date.
		 */
		sleb = ubifs_scan(c, next_lnum, 0, sbuf);
		if (IS_ERR(sleb))
			return sleb;
		if (sleb->nodes_cnt) {
			struct ubifs_scan_node *snod;
			unsigned long long cs_sqnum = c->cs_sqnum;

			/* Oldest node in the next log LEB */
			snod = list_entry(sleb->nodes.next,
					  struct ubifs_scan_node, list);
			if (cs_sqnum == 0) {
				int err;

				/*
				 * Commit start sqnum not known yet - read it
				 * from the CS node at @lnum:@offs.
				 */
				err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
				if (err) {
					ubifs_scan_destroy(sleb);
					return ERR_PTR(err);
				}
			}
			if (snod->sqnum > cs_sqnum) {
				/*
				 * The next LEB holds nodes newer than the
				 * commit start, so @lnum is not the log tail
				 * after all - nothing we can safely fix.
				 */
				ubifs_err("unrecoverable log corruption "
					  "in LEB %d", lnum);
				ubifs_scan_destroy(sleb);
				return ERR_PTR(-EUCLEAN);
			}
		}
		ubifs_scan_destroy(sleb);
	}
	/* The LEB itself is recovered like any other unclean LEB */
	return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
}
| 848 | |||
| 849 | /** | ||
| 850 | * recover_head - recover a head. | ||
| 851 | * @c: UBIFS file-system description object | ||
| 852 | * @lnum: LEB number of head to recover | ||
| 853 | * @offs: offset of head to recover | ||
| 854 | * @sbuf: LEB-sized buffer to use | ||
| 855 | * | ||
| 856 | * This function ensures that there is no data on the flash at a head location. | ||
| 857 | * | ||
| 858 | * This function returns %0 on success and a negative error code on failure. | ||
| 859 | */ | ||
| 860 | static int recover_head(const struct ubifs_info *c, int lnum, int offs, | ||
| 861 | void *sbuf) | ||
| 862 | { | ||
| 863 | int len, err, need_clean = 0; | ||
| 864 | |||
| 865 | if (c->min_io_size > 1) | ||
| 866 | len = c->min_io_size; | ||
| 867 | else | ||
| 868 | len = 512; | ||
| 869 | if (offs + len > c->leb_size) | ||
| 870 | len = c->leb_size - offs; | ||
| 871 | |||
| 872 | if (!len) | ||
| 873 | return 0; | ||
| 874 | |||
| 875 | /* Read at the head location and check it is empty flash */ | ||
| 876 | err = ubi_read(c->ubi, lnum, sbuf, offs, len); | ||
| 877 | if (err) | ||
| 878 | need_clean = 1; | ||
| 879 | else { | ||
| 880 | uint8_t *p = sbuf; | ||
| 881 | |||
| 882 | while (len--) | ||
| 883 | if (*p++ != 0xff) { | ||
| 884 | need_clean = 1; | ||
| 885 | break; | ||
| 886 | } | ||
| 887 | } | ||
| 888 | |||
| 889 | if (need_clean) { | ||
| 890 | dbg_rcvry("cleaning head at %d:%d", lnum, offs); | ||
| 891 | if (offs == 0) | ||
| 892 | return ubifs_leb_unmap(c, lnum); | ||
| 893 | err = ubi_read(c->ubi, lnum, sbuf, 0, offs); | ||
| 894 | if (err) | ||
| 895 | return err; | ||
| 896 | return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); | ||
| 897 | } | ||
| 898 | |||
| 899 | return 0; | ||
| 900 | } | ||
| 901 | |||
| 902 | /** | ||
| 903 | * ubifs_recover_inl_heads - recover index and LPT heads. | ||
| 904 | * @c: UBIFS file-system description object | ||
| 905 | * @sbuf: LEB-sized buffer to use | ||
| 906 | * | ||
| 907 | * This function ensures that there is no data on the flash at the index and | ||
| 908 | * LPT head locations. | ||
| 909 | * | ||
| 910 | * This deals with the recovery of a half-completed journal commit. UBIFS is | ||
| 911 | * careful never to overwrite the last version of the index or the LPT. Because | ||
| 912 | * the index and LPT are wandering trees, data from a half-completed commit will | ||
| 913 | * not be referenced anywhere in UBIFS. The data will be either in LEBs that are | ||
| 914 | * assumed to be empty and will be unmapped anyway before use, or in the index | ||
| 915 | * and LPT heads. | ||
| 916 | * | ||
| 917 | * This function returns %0 on success and a negative error code on failure. | ||
| 918 | */ | ||
| 919 | int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) | ||
| 920 | { | ||
| 921 | int err; | ||
| 922 | |||
| 923 | ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); | ||
| 924 | |||
| 925 | dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); | ||
| 926 | err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); | ||
| 927 | if (err) | ||
| 928 | return err; | ||
| 929 | |||
| 930 | dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); | ||
| 931 | err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); | ||
| 932 | if (err) | ||
| 933 | return err; | ||
| 934 | |||
| 935 | return 0; | ||
| 936 | } | ||
| 937 | |||
/**
 * clean_an_unclean_leb - read and write a LEB to remove corruption.
 * @c: UBIFS file-system description object
 * @ucleb: unclean LEB information
 * @sbuf: LEB-sized buffer to use
 *
 * This function reads a LEB up to a point pre-determined by the mount recovery,
 * checks the nodes, and writes the result back to the flash, thereby cleaning
 * off any following corruption, or non-fatal ECC errors.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
static int clean_an_unclean_leb(const struct ubifs_info *c,
				struct ubifs_unclean_leb *ucleb, void *sbuf)
{
	int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
	void *buf = sbuf;

	dbg_rcvry("LEB %d len %d", lnum, len);

	if (len == 0) {
		/* Nothing to read, just unmap it */
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			return err;
		return 0;
	}

	/* Read the pre-determined valid region; tolerate correctable ECC */
	err = ubi_read(c->ubi, lnum, buf, offs, len);
	if (err && err != -EBADMSG)
		return err;

	/* Walk node by node; 8 bytes is the minimum node alignment */
	while (len >= 8) {
		int ret;

		cond_resched();

		/* Scan quietly until there is an error */
		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);

		if (ret == SCANNED_A_NODE) {
			/* A valid node, and not a padding node */
			struct ubifs_ch *ch = buf;
			int node_len;

			node_len = ALIGN(le32_to_cpu(ch->len), 8);
			offs += node_len;
			buf += node_len;
			len -= node_len;
			continue;
		}

		if (ret > 0) {
			/* Padding bytes or a valid padding node */
			offs += ret;
			buf += ret;
			len -= ret;
			continue;
		}

		if (ret == SCANNED_EMPTY_SPACE) {
			/* Empty space before the end point means corruption */
			ubifs_err("unexpected empty space at %d:%d",
				  lnum, offs);
			return -EUCLEAN;
		}

		if (quiet) {
			/* Redo the last scan but noisily */
			quiet = 0;
			continue;
		}

		ubifs_scanned_corruption(c, lnum, offs, buf);
		return -EUCLEAN;
	}

	/* Pad to min_io_size */
	len = ALIGN(ucleb->endpt, c->min_io_size);
	if (len > ucleb->endpt) {
		int pad_len = len - ALIGN(ucleb->endpt, 8);

		if (pad_len > 0) {
			/*
			 * NOTE(review): this addresses c->sbuf while the rest
			 * of the function works on the @sbuf parameter -
			 * presumably callers always pass c->sbuf; confirm.
			 */
			buf = c->sbuf + len - pad_len;
			ubifs_pad(c, buf, pad_len);
		}
	}

	/* Write back the LEB atomically */
	err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN);
	if (err)
		return err;

	dbg_rcvry("cleaned LEB %d", lnum);

	return 0;
}
| 1034 | |||
| 1035 | /** | ||
| 1036 | * ubifs_clean_lebs - clean LEBs recovered during read-only mount. | ||
| 1037 | * @c: UBIFS file-system description object | ||
| 1038 | * @sbuf: LEB-sized buffer to use | ||
| 1039 | * | ||
| 1040 | * This function cleans a LEB identified during recovery that needs to be | ||
| 1041 | * written but was not because UBIFS was mounted read-only. This happens when | ||
| 1042 | * remounting to read-write mode. | ||
| 1043 | * | ||
| 1044 | * This function returns %0 on success and a negative error code on failure. | ||
| 1045 | */ | ||
| 1046 | int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) | ||
| 1047 | { | ||
| 1048 | dbg_rcvry("recovery"); | ||
| 1049 | while (!list_empty(&c->unclean_leb_list)) { | ||
| 1050 | struct ubifs_unclean_leb *ucleb; | ||
| 1051 | int err; | ||
| 1052 | |||
| 1053 | ucleb = list_entry(c->unclean_leb_list.next, | ||
| 1054 | struct ubifs_unclean_leb, list); | ||
| 1055 | err = clean_an_unclean_leb(c, ucleb, sbuf); | ||
| 1056 | if (err) | ||
| 1057 | return err; | ||
| 1058 | list_del(&ucleb->list); | ||
| 1059 | kfree(ucleb); | ||
| 1060 | } | ||
| 1061 | return 0; | ||
| 1062 | } | ||
| 1063 | |||
/**
 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
 * @c: UBIFS file-system description object
 *
 * Out-of-place garbage collection requires always one empty LEB with which to
 * start garbage collection. The LEB number is recorded in c->gc_lnum and is
 * written to the master node on unmounting. In the case of an unclean unmount
 * the value of gc_lnum recorded in the master node is out of date and cannot
 * be used. Instead, recovery must allocate an empty LEB for this purpose.
 * However, there may not be enough empty space, in which case it must be
 * possible to GC the dirtiest LEB into the GC head LEB.
 *
 * This function also runs the commit which causes the TNC updates from
 * size-recovery and orphans to be written to the flash. That is important to
 * ensure correct replay order for subsequent mounts.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_rcvry_gc_commit(struct ubifs_info *c)
{
	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
	struct ubifs_lprops lp;
	int lnum, err;

	c->gc_lnum = -1;
	if (wbuf->lnum == -1) {
		dbg_rcvry("no GC head LEB");
		goto find_free;
	}
	/*
	 * See whether the used space in the dirtiest LEB fits in the GC head
	 * LEB.
	 */
	if (wbuf->offs == c->leb_size) {
		dbg_rcvry("no room in GC head LEB");
		goto find_free;
	}
	/* Find a dirty LEB whose used data would fit below wbuf->offs */
	err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
	if (err) {
		if (err == -ENOSPC)
			dbg_err("could not find a dirty LEB");
		return err;
	}
	ubifs_assert(!(lp.flags & LPROPS_INDEX));
	lnum = lp.lnum;
	if (lp.free + lp.dirty == c->leb_size) {
		/* An empty LEB was returned */
		if (lp.free != c->leb_size) {
			/* Mark the whole LEB free in lprops first */
			err = ubifs_change_one_lp(c, lnum, c->leb_size,
						  0, 0, 0, 0);
			if (err)
				return err;
		}
		err = ubifs_leb_unmap(c, lnum);
		if (err)
			return err;
		c->gc_lnum = lnum;
		dbg_rcvry("allocated LEB %d for GC", lnum);
		/* Run the commit */
		dbg_rcvry("committing");
		return ubifs_run_commit(c);
	}
	/*
	 * There was no empty LEB so the used space in the dirtiest LEB must fit
	 * in the GC head LEB.
	 */
	if (lp.free + lp.dirty < wbuf->offs) {
		dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
			  lnum, wbuf->lnum, wbuf->offs);
		/* Give the LEB back to the find-dirty machinery */
		err = ubifs_return_leb(c, lnum);
		if (err)
			return err;
		goto find_free;
	}
	/*
	 * We run the commit before garbage collection otherwise subsequent
	 * mounts will see the GC and orphan deletion in a different order.
	 */
	dbg_rcvry("committing");
	err = ubifs_run_commit(c);
	if (err)
		return err;
	/*
	 * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
	 * - use locking to keep 'ubifs_assert()' happy.
	 */
	dbg_rcvry("GC'ing LEB %d", lnum);
	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
	err = ubifs_garbage_collect_leb(c, &lp);
	if (err >= 0) {
		/* Flush the write-buffer so the GC'ed data is on flash */
		int err2 = ubifs_wbuf_sync_nolock(wbuf);

		if (err2)
			err = err2;
	}
	mutex_unlock(&wbuf->io_mutex);
	if (err < 0) {
		dbg_err("GC failed, error %d", err);
		if (err == -EAGAIN)
			err = -EINVAL;
		return err;
	}
	/* The GC'ed LEB must be entirely reclaimed (retained as gc_lnum) */
	if (err != LEB_RETAINED) {
		dbg_err("GC returned %d", err);
		return -EINVAL;
	}
	err = ubifs_leb_unmap(c, c->gc_lnum);
	if (err)
		return err;
	dbg_rcvry("allocated LEB %d for GC", lnum);
	return 0;

find_free:
	/*
	 * There is no GC head LEB or the free space in the GC head LEB is too
	 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
	 * GC is not run.
	 */
	lnum = ubifs_find_free_leb_for_idx(c);
	if (lnum < 0) {
		dbg_err("could not find an empty LEB");
		return lnum;
	}
	/* And reset the index flag */
	err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
				  LPROPS_INDEX, 0);
	if (err)
		return err;
	c->gc_lnum = lnum;
	dbg_rcvry("allocated LEB %d for GC", lnum);
	/* Run the commit */
	dbg_rcvry("committing");
	return ubifs_run_commit(c);
}
| 1198 | |||
/**
 * struct size_entry - inode size information for recovery.
 * @rb: link in the RB-tree of sizes (c->size_tree, keyed by @inum)
 * @inum: inode number
 * @i_size: size on inode
 * @d_size: maximum size based on data nodes
 * @exists: indicates whether the inode exists
 * @inode: inode if pinned in memory awaiting rw mode to fix it
 */
struct size_entry {
	struct rb_node rb;
	ino_t inum;
	loff_t i_size;
	loff_t d_size;
	int exists;
	struct inode *inode;
};
| 1216 | |||
| 1217 | /** | ||
| 1218 | * add_ino - add an entry to the size tree. | ||
| 1219 | * @c: UBIFS file-system description object | ||
| 1220 | * @inum: inode number | ||
| 1221 | * @i_size: size on inode | ||
| 1222 | * @d_size: maximum size based on data nodes | ||
| 1223 | * @exists: indicates whether the inode exists | ||
| 1224 | */ | ||
| 1225 | static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, | ||
| 1226 | loff_t d_size, int exists) | ||
| 1227 | { | ||
| 1228 | struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; | ||
| 1229 | struct size_entry *e; | ||
| 1230 | |||
| 1231 | while (*p) { | ||
| 1232 | parent = *p; | ||
| 1233 | e = rb_entry(parent, struct size_entry, rb); | ||
| 1234 | if (inum < e->inum) | ||
| 1235 | p = &(*p)->rb_left; | ||
| 1236 | else | ||
| 1237 | p = &(*p)->rb_right; | ||
| 1238 | } | ||
| 1239 | |||
| 1240 | e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); | ||
| 1241 | if (!e) | ||
| 1242 | return -ENOMEM; | ||
| 1243 | |||
| 1244 | e->inum = inum; | ||
| 1245 | e->i_size = i_size; | ||
| 1246 | e->d_size = d_size; | ||
| 1247 | e->exists = exists; | ||
| 1248 | |||
| 1249 | rb_link_node(&e->rb, parent, p); | ||
| 1250 | rb_insert_color(&e->rb, &c->size_tree); | ||
| 1251 | |||
| 1252 | return 0; | ||
| 1253 | } | ||
| 1254 | |||
| 1255 | /** | ||
| 1256 | * find_ino - find an entry on the size tree. | ||
| 1257 | * @c: UBIFS file-system description object | ||
| 1258 | * @inum: inode number | ||
| 1259 | */ | ||
| 1260 | static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) | ||
| 1261 | { | ||
| 1262 | struct rb_node *p = c->size_tree.rb_node; | ||
| 1263 | struct size_entry *e; | ||
| 1264 | |||
| 1265 | while (p) { | ||
| 1266 | e = rb_entry(p, struct size_entry, rb); | ||
| 1267 | if (inum < e->inum) | ||
| 1268 | p = p->rb_left; | ||
| 1269 | else if (inum > e->inum) | ||
| 1270 | p = p->rb_right; | ||
| 1271 | else | ||
| 1272 | return e; | ||
| 1273 | } | ||
| 1274 | return NULL; | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | /** | ||
| 1278 | * remove_ino - remove an entry from the size tree. | ||
| 1279 | * @c: UBIFS file-system description object | ||
| 1280 | * @inum: inode number | ||
| 1281 | */ | ||
| 1282 | static void remove_ino(struct ubifs_info *c, ino_t inum) | ||
| 1283 | { | ||
| 1284 | struct size_entry *e = find_ino(c, inum); | ||
| 1285 | |||
| 1286 | if (!e) | ||
| 1287 | return; | ||
| 1288 | rb_erase(&e->rb, &c->size_tree); | ||
| 1289 | kfree(e); | ||
| 1290 | } | ||
| 1291 | |||
/**
 * ubifs_destroy_size_tree - free resources related to the size tree.
 * @c: UBIFS file-system description object
 *
 * Frees every entry in the size tree, dropping the reference on any inode
 * pinned in memory, and resets the tree to empty.
 */
void ubifs_destroy_size_tree(struct ubifs_info *c)
{
	struct rb_node *this = c->size_tree.rb_node;
	struct size_entry *e;

	/*
	 * Iterative post-order destruction: always descend to a leaf, free
	 * it, then detach it from its parent so it is not visited again.
	 */
	while (this) {
		if (this->rb_left) {
			this = this->rb_left;
			continue;
		} else if (this->rb_right) {
			this = this->rb_right;
			continue;
		}
		e = rb_entry(this, struct size_entry, rb);
		if (e->inode)
			iput(e->inode);
		this = rb_parent(this);
		if (this) {
			/* Null out the parent's link to the freed child */
			if (this->rb_left == &e->rb)
				this->rb_left = NULL;
			else
				this->rb_right = NULL;
		}
		kfree(e);
	}
	c->size_tree = RB_ROOT;
}
| 1323 | |||
/**
 * ubifs_recover_size_accum - accumulate inode sizes for recovery.
 * @c: UBIFS file-system description object
 * @key: node key
 * @deletion: node is for a deletion
 * @new_size: inode size
 *
 * This function has two purposes:
 * 1) to ensure there are no data nodes that fall outside the inode size
 * 2) to ensure there are no data nodes for inodes that do not exist
 * To accomplish those purposes, a rb-tree is constructed containing an entry
 * for each inode number in the journal that has not been deleted, and recording
 * the size from the inode node, the maximum size of any data node (also altered
 * by truncations) and a flag indicating an inode number for which no inode node
 * was present in the journal.
 *
 * Note that there is still the possibility that there are data nodes that have
 * been committed that are beyond the inode size, however the only way to find
 * them would be to scan the entire index. Alternatively, some provision could
 * be made to record the size of inodes at the start of commit, which would seem
 * very cumbersome for a scenario that is quite unlikely and the only negative
 * consequence of which is wasted space.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
			     int deletion, loff_t new_size)
{
	ino_t inum = key_inum(c, key);
	struct size_entry *e;
	int err;

	switch (key_type(c, key)) {
	case UBIFS_INO_KEY:
		/* Inode node: record (or drop, on deletion) the on-inode size */
		if (deletion)
			remove_ino(c, inum);
		else {
			e = find_ino(c, inum);
			if (e) {
				e->i_size = new_size;
				e->exists = 1;
			} else {
				err = add_ino(c, inum, new_size, 0, 1);
				if (err)
					return err;
			}
		}
		break;
	case UBIFS_DATA_KEY:
		/* Data node: track the maximum size implied by data nodes */
		e = find_ino(c, inum);
		if (e) {
			if (new_size > e->d_size)
				e->d_size = new_size;
		} else {
			/* No inode node seen yet - exists flag stays 0 */
			err = add_ino(c, inum, 0, new_size, 0);
			if (err)
				return err;
		}
		break;
	case UBIFS_TRUN_KEY:
		/* Truncation: the data size is reset, not maximized */
		e = find_ino(c, inum);
		if (e)
			e->d_size = new_size;
		break;
	}
	return 0;
}
| 1391 | |||
/**
 * fix_size_in_place - fix inode size in place on flash.
 * @c: UBIFS file-system description object
 * @e: inode size information for recovery
 *
 * Locates the inode node of @e->inum on flash, rewrites its size field to
 * @e->d_size (recalculating the node CRC) and atomically writes the LEB back.
 *
 * Returns %0 on success and a negative error code on failure.
 */
static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
{
	struct ubifs_ino_node *ino = c->sbuf;
	unsigned char *p;
	union ubifs_key key;
	int err, lnum, offs, len;
	loff_t i_size;
	uint32_t crc;

	/* Locate the inode node LEB number and offset */
	ino_key_init(c, &key, e->inum);
	err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
	if (err)
		goto out;
	/*
	 * If the size recorded on the inode node is greater than the size that
	 * was calculated from nodes in the journal then don't change the inode.
	 */
	i_size = le64_to_cpu(ino->size);
	if (i_size >= e->d_size)
		return 0;
	/* Read the LEB */
	err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size);
	if (err)
		goto out;
	/* Change the size field and recalculate the CRC */
	ino = c->sbuf + offs;
	ino->size = cpu_to_le64(e->d_size);
	len = le32_to_cpu(ino->ch.len);
	/* CRC covers the node after the 8-byte common-header magic/crc words */
	crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8);
	ino->ch.crc = cpu_to_le32(crc);
	/* Work out where data in the LEB ends and free space begins */
	p = c->sbuf;
	len = c->leb_size - 1;
	/* Scan back over the empty (0xff) tail of the LEB */
	while (p[len] == 0xff)
		len -= 1;
	len = ALIGN(len + 1, c->min_io_size);
	/* Atomically write the fixed LEB back again */
	err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
	if (err)
		goto out;
	/*
	 * NOTE(review): e->inum is ino_t printed with %lu - presumably ino_t
	 * is unsigned long on all supported configs; a cast would be safer.
	 */
	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
		  i_size, e->d_size);
	return 0;

out:
	ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
		   e->inum, e->i_size, e->d_size, err);
	return err;
}
| 1447 | |||
/**
 * ubifs_recover_size - recover inode size.
 * @c: UBIFS file-system description object
 *
 * This function attempts to fix inode size discrepancies identified by the
 * 'ubifs_recover_size_accum()' function.
 *
 * This function returns %0 on success and a negative error code on failure.
 */
int ubifs_recover_size(struct ubifs_info *c)
{
	struct rb_node *this = rb_first(&c->size_tree);

	while (this) {
		struct size_entry *e;
		int err;

		e = rb_entry(this, struct size_entry, rb);
		if (!e->exists) {
			union ubifs_key key;

			/* No inode node was replayed - is it in the index? */
			ino_key_init(c, &key, e->inum);
			err = ubifs_tnc_lookup(c, &key, c->sbuf);
			if (err && err != -ENOENT)
				return err;
			if (err == -ENOENT) {
				/* Remove data nodes that have no inode */
				dbg_rcvry("removing ino %lu", e->inum);
				err = ubifs_tnc_remove_ino(c, e->inum);
				if (err)
					return err;
			} else {
				/* Inode found in the index: take its size */
				struct ubifs_ino_node *ino = c->sbuf;

				e->exists = 1;
				e->i_size = le64_to_cpu(ino->size);
			}
		}
		if (e->exists && e->i_size < e->d_size) {
			if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
				/* Fix the inode size and pin it in memory */
				struct inode *inode;

				inode = ubifs_iget(c->vfs_sb, e->inum);
				if (IS_ERR(inode))
					return PTR_ERR(inode);
				if (inode->i_size < e->d_size) {
					dbg_rcvry("ino %lu size %lld -> %lld",
						  e->inum, e->d_size,
						  inode->i_size);
					inode->i_size = e->d_size;
					ubifs_inode(inode)->ui_size = e->d_size;
					/*
					 * Keep the entry (and the inode ref)
					 * so it can be fixed on flash when we
					 * remount read-write.
					 */
					e->inode = inode;
					this = rb_next(this);
					continue;
				}
				iput(inode);
			} else {
				/* Fix the size in place */
				err = fix_size_in_place(c, e);
				if (err)
					return err;
				if (e->inode)
					iput(e->inode);
			}
		}
		/* Done with this entry: advance, then erase and free it */
		this = rb_next(this);
		rb_erase(&e->rb, &c->size_tree);
		kfree(e);
	}
	return 0;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c new file mode 100644 index 000000000000..7399692af859 --- /dev/null +++ b/fs/ubifs/replay.c | |||
| @@ -0,0 +1,1075 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
/*
 * This file contains the journal replay code. It runs when the file-system is
 * being mounted and requires no locking.
 *
 * The larger the journal, the longer it takes to scan it, so the longer it
 * takes to mount UBIFS. This is why the journal has a limited size, which may
 * be changed depending on the system requirements. But a larger journal gives
 * faster I/O speed because it writes the index less frequently. So this is a
 * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
 * larger the journal, the more memory its index may consume.
 */
| 34 | |||
| 35 | #include "ubifs.h" | ||
| 36 | |||
/*
 * Replay flags (stored in the 'flags' field of struct replay_entry).
 *
 * REPLAY_DELETION: node was deleted
 * REPLAY_REF: node is a reference node
 */
enum {
	REPLAY_DELETION = 1,
	REPLAY_REF = 2,
};
| 47 | |||
/**
 * struct replay_entry - replay tree entry.
 * @lnum: logical eraseblock number of the node
 * @offs: node offset
 * @len: node length
 * @sqnum: node sequence number
 * @flags: replay flags (%REPLAY_DELETION, %REPLAY_REF)
 * @rb: links the replay tree
 * @key: node key
 * @nm: directory entry name
 * @old_size: truncation old size
 * @new_size: truncation new size
 * @free: amount of free space in a bud
 * @dirty: amount of dirty space in a bud from padding and deletion nodes
 *
 * UBIFS journal replay must compare node sequence numbers, which means it must
 * build a tree of node information to insert into the TNC.
 */
struct replay_entry {
	int lnum;
	int offs;
	int len;
	unsigned long long sqnum;
	int flags;
	struct rb_node rb;
	union ubifs_key key;
	/*
	 * Per-node-type payload: only one of the union members is meaningful,
	 * depending on what kind of node this entry describes.
	 */
	union {
		struct qstr nm;
		struct {
			loff_t old_size;
			loff_t new_size;
		};
		struct {
			int free;
			int dirty;
		};
	};
};
| 86 | |||
/**
 * struct bud_entry - entry in the list of buds to replay.
 * @list: next bud in the list (linked into @c->replay_buds)
 * @bud: bud description object
 * @free: free bytes in the bud
 * @sqnum: reference node sequence number
 */
struct bud_entry {
	struct list_head list;
	struct ubifs_bud *bud;
	int free;
	unsigned long long sqnum;
};
| 100 | |||
/**
 * set_bud_lprops - set free and dirty space used by a bud.
 * @c: UBIFS file-system description object
 * @r: replay entry of bud
 *
 * This function updates the LEB properties of the bud LEB described by @r
 * with the free/dirty figures collected during replay. Returns zero in case
 * of success and a negative error code in case of failure.
 */
static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
{
	const struct ubifs_lprops *lp;
	int err = 0, dirty;

	ubifs_get_lprops(c);

	lp = ubifs_lpt_lookup_dirty(c, r->lnum);
	if (IS_ERR(lp)) {
		err = PTR_ERR(lp);
		goto out;
	}

	dirty = lp->dirty;
	if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
		/*
		 * The LEB was added to the journal with a starting offset of
		 * zero which means the LEB must have been empty. The LEB
		 * property values should be lp->free == c->leb_size and
		 * lp->dirty == 0, but that is not the case. The reason is that
		 * the LEB was garbage collected. The garbage collector resets
		 * the free and dirty space without recording it anywhere except
		 * lprops, so if there is not a commit then lprops does not have
		 * that information next time the file system is mounted.
		 *
		 * We do not need to adjust free space because the scan has told
		 * us the exact value which is recorded in the replay entry as
		 * r->free.
		 *
		 * However we do need to subtract from the dirty space the
		 * amount of space that the garbage collector reclaimed, which
		 * is the whole LEB minus the amount of space that was free.
		 */
		dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
			lp->free, lp->dirty);
		dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
			lp->free, lp->dirty);
		dirty -= c->leb_size - lp->free;
		/*
		 * If the replay order was perfect the dirty space would now be
		 * zero. The order is not perfect because the journal heads
		 * race with each other. This is not a problem but it does mean
		 * that the dirty space may temporarily exceed c->leb_size
		 * during the replay.
		 */
		if (dirty != 0)
			dbg_msg("LEB %d lp: %d free %d dirty "
				"replay: %d free %d dirty", r->lnum, lp->free,
				lp->dirty, r->free, r->dirty);
	}
	lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
			     lp->flags | LPROPS_TAKEN, 0);
	if (IS_ERR(lp)) {
		err = PTR_ERR(lp);
		goto out;
	}
out:
	ubifs_release_lprops(c);
	return err;
}
| 166 | |||
| 167 | /** | ||
| 168 | * trun_remove_range - apply a replay entry for a truncation to the TNC. | ||
| 169 | * @c: UBIFS file-system description object | ||
| 170 | * @r: replay entry of truncation | ||
| 171 | */ | ||
| 172 | static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) | ||
| 173 | { | ||
| 174 | unsigned min_blk, max_blk; | ||
| 175 | union ubifs_key min_key, max_key; | ||
| 176 | ino_t ino; | ||
| 177 | |||
| 178 | min_blk = r->new_size / UBIFS_BLOCK_SIZE; | ||
| 179 | if (r->new_size & (UBIFS_BLOCK_SIZE - 1)) | ||
| 180 | min_blk += 1; | ||
| 181 | |||
| 182 | max_blk = r->old_size / UBIFS_BLOCK_SIZE; | ||
| 183 | if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0) | ||
| 184 | max_blk -= 1; | ||
| 185 | |||
| 186 | ino = key_inum(c, &r->key); | ||
| 187 | |||
| 188 | data_key_init(c, &min_key, ino, min_blk); | ||
| 189 | data_key_init(c, &max_key, ino, max_blk); | ||
| 190 | |||
| 191 | return ubifs_tnc_remove_range(c, &min_key, &max_key); | ||
| 192 | } | ||
| 193 | |||
/**
 * apply_replay_entry - apply a replay entry to the TNC.
 * @c: UBIFS file-system description object
 * @r: replay entry to apply
 *
 * Apply a replay entry to the TNC. Reference entries update bud LEB
 * properties, hash-keyed (directory/xattr) entries are added or removed by
 * name, and all other entries are added or removed by key. Returns zero in
 * case of success and a negative error code in case of failure.
 */
static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
	int err, deletion = ((r->flags & REPLAY_DELETION) != 0);

	dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
		r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));

	/* Set c->replay_sqnum to help deal with dangling branches. */
	c->replay_sqnum = r->sqnum;

	if (r->flags & REPLAY_REF)
		err = set_bud_lprops(c, r);
	else if (is_hash_key(c, &r->key)) {
		/* Directory/xattr entries are looked up by name */
		if (deletion)
			err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
		else
			err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
					       r->len, &r->nm);
	} else {
		if (deletion)
			switch (key_type(c, &r->key)) {
			case UBIFS_INO_KEY:
			{
				/* Inode deletion removes the whole inode */
				ino_t inum = key_inum(c, &r->key);

				err = ubifs_tnc_remove_ino(c, inum);
				break;
			}
			case UBIFS_TRUN_KEY:
				/* Truncation removes a range of data nodes */
				err = trun_remove_range(c, r);
				break;
			default:
				err = ubifs_tnc_remove(c, &r->key);
				break;
			}
		else
			err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
					    r->len);
		if (err)
			return err;

		/* After an unclean unmount, accumulate inode size updates */
		if (c->need_recovery)
			err = ubifs_recover_size_accum(c, &r->key, deletion,
						       r->new_size);
	}

	return err;
}
| 249 | |||
/**
 * destroy_replay_tree - destroy the replay.
 * @c: UBIFS file-system description object
 *
 * Destroy the replay tree, freeing every entry. The tree is torn down
 * iteratively (no recursion): descend to a leaf, free it, detach it from its
 * parent so the parent eventually becomes a leaf itself.
 */
static void destroy_replay_tree(struct ubifs_info *c)
{
	struct rb_node *this = c->replay_tree.rb_node;
	struct replay_entry *r;

	while (this) {
		/* Descend until a node with no children is found */
		if (this->rb_left) {
			this = this->rb_left;
			continue;
		} else if (this->rb_right) {
			this = this->rb_right;
			continue;
		}
		r = rb_entry(this, struct replay_entry, rb);
		this = rb_parent(this);
		if (this) {
			/* Unhook the leaf we are about to free */
			if (this->rb_left == &r->rb)
				this->rb_left = NULL;
			else
				this->rb_right = NULL;
		}
		/* Only hash-keyed entries own a separately allocated name */
		if (is_hash_key(c, &r->key))
			kfree(r->nm.name);
		kfree(r);
	}
	c->replay_tree = RB_ROOT;
}
| 283 | |||
| 284 | /** | ||
| 285 | * apply_replay_tree - apply the replay tree to the TNC. | ||
| 286 | * @c: UBIFS file-system description object | ||
| 287 | * | ||
| 288 | * Apply the replay tree. | ||
| 289 | * Returns zero in case of success and a negative error code in case of | ||
| 290 | * failure. | ||
| 291 | */ | ||
| 292 | static int apply_replay_tree(struct ubifs_info *c) | ||
| 293 | { | ||
| 294 | struct rb_node *this = rb_first(&c->replay_tree); | ||
| 295 | |||
| 296 | while (this) { | ||
| 297 | struct replay_entry *r; | ||
| 298 | int err; | ||
| 299 | |||
| 300 | cond_resched(); | ||
| 301 | |||
| 302 | r = rb_entry(this, struct replay_entry, rb); | ||
| 303 | err = apply_replay_entry(c, r); | ||
| 304 | if (err) | ||
| 305 | return err; | ||
| 306 | this = rb_next(this); | ||
| 307 | } | ||
| 308 | return 0; | ||
| 309 | } | ||
| 310 | |||
| 311 | /** | ||
| 312 | * insert_node - insert a node to the replay tree. | ||
| 313 | * @c: UBIFS file-system description object | ||
| 314 | * @lnum: node logical eraseblock number | ||
| 315 | * @offs: node offset | ||
| 316 | * @len: node length | ||
| 317 | * @key: node key | ||
| 318 | * @sqnum: sequence number | ||
| 319 | * @deletion: non-zero if this is a deletion | ||
| 320 | * @used: number of bytes in use in a LEB | ||
| 321 | * @old_size: truncation old size | ||
| 322 | * @new_size: truncation new size | ||
| 323 | * | ||
| 324 | * This function inserts a scanned non-direntry node to the replay tree. The | ||
| 325 | * replay tree is an RB-tree containing @struct replay_entry elements which are | ||
| 326 | * indexed by the sequence number. The replay tree is applied at the very end | ||
| 327 | * of the replay process. Since the tree is sorted in sequence number order, | ||
| 328 | * the older modifications are applied first. This function returns zero in | ||
| 329 | * case of success and a negative error code in case of failure. | ||
| 330 | */ | ||
| 331 | static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, | ||
| 332 | union ubifs_key *key, unsigned long long sqnum, | ||
| 333 | int deletion, int *used, loff_t old_size, | ||
| 334 | loff_t new_size) | ||
| 335 | { | ||
| 336 | struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; | ||
| 337 | struct replay_entry *r; | ||
| 338 | |||
| 339 | if (key_inum(c, key) >= c->highest_inum) | ||
| 340 | c->highest_inum = key_inum(c, key); | ||
| 341 | |||
| 342 | dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); | ||
| 343 | while (*p) { | ||
| 344 | parent = *p; | ||
| 345 | r = rb_entry(parent, struct replay_entry, rb); | ||
| 346 | if (sqnum < r->sqnum) { | ||
| 347 | p = &(*p)->rb_left; | ||
| 348 | continue; | ||
| 349 | } else if (sqnum > r->sqnum) { | ||
| 350 | p = &(*p)->rb_right; | ||
| 351 | continue; | ||
| 352 | } | ||
| 353 | ubifs_err("duplicate sqnum in replay"); | ||
| 354 | return -EINVAL; | ||
| 355 | } | ||
| 356 | |||
| 357 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); | ||
| 358 | if (!r) | ||
| 359 | return -ENOMEM; | ||
| 360 | |||
| 361 | if (!deletion) | ||
| 362 | *used += ALIGN(len, 8); | ||
| 363 | r->lnum = lnum; | ||
| 364 | r->offs = offs; | ||
| 365 | r->len = len; | ||
| 366 | r->sqnum = sqnum; | ||
| 367 | r->flags = (deletion ? REPLAY_DELETION : 0); | ||
| 368 | r->old_size = old_size; | ||
| 369 | r->new_size = new_size; | ||
| 370 | key_copy(c, key, &r->key); | ||
| 371 | |||
| 372 | rb_link_node(&r->rb, parent, p); | ||
| 373 | rb_insert_color(&r->rb, &c->replay_tree); | ||
| 374 | return 0; | ||
| 375 | } | ||
| 376 | |||
/**
 * insert_dent - insert a directory entry node into the replay tree.
 * @c: UBIFS file-system description object
 * @lnum: node logical eraseblock number
 * @offs: node offset
 * @len: node length
 * @key: node key
 * @name: directory entry name
 * @nlen: directory entry name length
 * @sqnum: sequence number
 * @deletion: non-zero if this is a deletion
 * @used: number of bytes in use in a LEB
 *
 * This function inserts a scanned directory entry node to the replay tree.
 * The name is copied into a freshly allocated, NUL-terminated buffer, so the
 * caller's buffer does not have to remain valid. Returns zero in case of
 * success and a negative error code in case of failure.
 *
 * This function is also used for extended attribute entries because they are
 * implemented as directory entry nodes.
 */
static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
		       union ubifs_key *key, const char *name, int nlen,
		       unsigned long long sqnum, int deletion, int *used)
{
	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
	struct replay_entry *r;
	char *nbuf;

	/* Keep track of the highest inode number seen during replay */
	if (key_inum(c, key) >= c->highest_inum)
		c->highest_inum = key_inum(c, key);

	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
	while (*p) {
		parent = *p;
		r = rb_entry(parent, struct replay_entry, rb);
		if (sqnum < r->sqnum) {
			p = &(*p)->rb_left;
			continue;
		}
		if (sqnum > r->sqnum) {
			p = &(*p)->rb_right;
			continue;
		}
		/* Sequence numbers must be unique */
		ubifs_err("duplicate sqnum in replay");
		return -EINVAL;
	}

	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
	if (!r)
		return -ENOMEM;
	nbuf = kmalloc(nlen + 1, GFP_KERNEL);
	if (!nbuf) {
		kfree(r);
		return -ENOMEM;
	}

	/* Deletion nodes do not occupy space in the bud */
	if (!deletion)
		*used += ALIGN(len, 8);
	r->lnum = lnum;
	r->offs = offs;
	r->len = len;
	r->sqnum = sqnum;
	r->nm.len = nlen;
	memcpy(nbuf, name, nlen);
	nbuf[nlen] = '\0';
	r->nm.name = nbuf;
	r->flags = (deletion ? REPLAY_DELETION : 0);
	key_copy(c, key, &r->key);

	ubifs_assert(!*p);
	rb_link_node(&r->rb, parent, p);
	rb_insert_color(&r->rb, &c->replay_tree);
	return 0;
}
| 451 | |||
| 452 | /** | ||
| 453 | * ubifs_validate_entry - validate directory or extended attribute entry node. | ||
| 454 | * @c: UBIFS file-system description object | ||
| 455 | * @dent: the node to validate | ||
| 456 | * | ||
| 457 | * This function validates directory or extended attribute entry node @dent. | ||
| 458 | * Returns zero if the node is all right and a %-EINVAL if not. | ||
| 459 | */ | ||
| 460 | int ubifs_validate_entry(struct ubifs_info *c, | ||
| 461 | const struct ubifs_dent_node *dent) | ||
| 462 | { | ||
| 463 | int key_type = key_type_flash(c, dent->key); | ||
| 464 | int nlen = le16_to_cpu(dent->nlen); | ||
| 465 | |||
| 466 | if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || | ||
| 467 | dent->type >= UBIFS_ITYPES_CNT || | ||
| 468 | nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || | ||
| 469 | strnlen(dent->name, nlen) != nlen || | ||
| 470 | le64_to_cpu(dent->inum) > MAX_INUM) { | ||
| 471 | ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? | ||
| 472 | "directory entry" : "extended attribute entry"); | ||
| 473 | return -EINVAL; | ||
| 474 | } | ||
| 475 | |||
| 476 | if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { | ||
| 477 | ubifs_err("bad key type %d", key_type); | ||
| 478 | return -EINVAL; | ||
| 479 | } | ||
| 480 | |||
| 481 | return 0; | ||
| 482 | } | ||
| 483 | |||
/**
 * replay_bud - replay a bud logical eraseblock.
 * @c: UBIFS file-system description object
 * @lnum: bud logical eraseblock number to replay
 * @offs: bud start offset
 * @jhead: journal head to which this bud belongs
 * @free: amount of free space in the bud is returned here
 * @dirty: amount of dirty space from padding and deletion nodes is returned
 * here
 *
 * This function scans the bud LEB, inserts a replay entry for every node
 * found, and reports back the resulting free/dirty space figures. It returns
 * zero in case of success and a negative error code in case of failure.
 */
static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
		      int *free, int *dirty)
{
	int err = 0, used = 0;
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	struct ubifs_bud *bud;

	dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
	/* After an unclean unmount, let recovery fix up the bud LEB */
	if (c->need_recovery)
		sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
	else
		sleb = ubifs_scan(c, lnum, offs, c->sbuf);
	if (IS_ERR(sleb))
		return PTR_ERR(sleb);

	/*
	 * The bud does not have to start from offset zero - the beginning of
	 * the 'lnum' LEB may contain previously committed data. One of the
	 * things we have to do in replay is to correctly update lprops with
	 * newer information about this LEB.
	 *
	 * At this point lprops thinks that this LEB has 'c->leb_size - offs'
	 * bytes of free space because it only contain information about
	 * committed data.
	 *
	 * But we know that real amount of free space is 'c->leb_size -
	 * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
	 * 'sleb->endpt' is used by bud data. We have to correctly calculate
	 * how much of these data are dirty and update lprops with this
	 * information.
	 *
	 * The dirt in that LEB region is comprised of padding nodes, deletion
	 * nodes, truncation nodes and nodes which are obsoleted by subsequent
	 * nodes in this LEB. So instead of calculating clean space, we
	 * calculate used space ('used' variable).
	 */

	list_for_each_entry(snod, &sleb->nodes, list) {
		int deletion = 0;

		cond_resched();

		/* Sequence numbers must stay below the watermark */
		if (snod->sqnum >= SQNUM_WATERMARK) {
			ubifs_err("file system's life ended");
			goto out_dump;
		}

		if (snod->sqnum > c->max_sqnum)
			c->max_sqnum = snod->sqnum;

		switch (snod->type) {
		case UBIFS_INO_NODE:
		{
			struct ubifs_ino_node *ino = snod->node;
			loff_t new_size = le64_to_cpu(ino->size);

			/* Zero link count means the inode was deleted */
			if (le32_to_cpu(ino->nlink) == 0)
				deletion = 1;
			err = insert_node(c, lnum, snod->offs, snod->len,
					  &snod->key, snod->sqnum, deletion,
					  &used, 0, new_size);
			break;
		}
		case UBIFS_DATA_NODE:
		{
			struct ubifs_data_node *dn = snod->node;
			/* Size the inode must have to hold this data node */
			loff_t new_size = le32_to_cpu(dn->size) +
					  key_block(c, &snod->key) *
					  UBIFS_BLOCK_SIZE;

			err = insert_node(c, lnum, snod->offs, snod->len,
					  &snod->key, snod->sqnum, deletion,
					  &used, 0, new_size);
			break;
		}
		case UBIFS_DENT_NODE:
		case UBIFS_XENT_NODE:
		{
			struct ubifs_dent_node *dent = snod->node;

			err = ubifs_validate_entry(c, dent);
			if (err)
				goto out_dump;

			/* Zero target inode number means entry deletion */
			err = insert_dent(c, lnum, snod->offs, snod->len,
					  &snod->key, dent->name,
					  le16_to_cpu(dent->nlen), snod->sqnum,
					  !le64_to_cpu(dent->inum), &used);
			break;
		}
		case UBIFS_TRUN_NODE:
		{
			struct ubifs_trun_node *trun = snod->node;
			loff_t old_size = le64_to_cpu(trun->old_size);
			loff_t new_size = le64_to_cpu(trun->new_size);
			union ubifs_key key;

			/* Validate truncation node */
			if (old_size < 0 || old_size > c->max_inode_sz ||
			    new_size < 0 || new_size > c->max_inode_sz ||
			    old_size <= new_size) {
				ubifs_err("bad truncation node");
				goto out_dump;
			}

			/*
			 * Create a fake truncation key just to use the same
			 * functions which expect nodes to have keys.
			 */
			trun_key_init(c, &key, le32_to_cpu(trun->inum));
			err = insert_node(c, lnum, snod->offs, snod->len,
					  &key, snod->sqnum, 1, &used,
					  old_size, new_size);
			break;
		}
		default:
			ubifs_err("unexpected node type %d in bud LEB %d:%d",
				  snod->type, lnum, snod->offs);
			err = -EINVAL;
			goto out_dump;
		}
		if (err)
			goto out;
	}

	/* The bud was registered by add_replay_bud(), so it must exist */
	bud = ubifs_search_bud(c, lnum);
	if (!bud)
		BUG();

	ubifs_assert(sleb->endpt - offs >= used);
	ubifs_assert(sleb->endpt % c->min_io_size == 0);

	/* Position the journal head write-buffer after the replayed data */
	if (sleb->endpt + c->min_io_size <= c->leb_size &&
	    !(c->vfs_sb->s_flags & MS_RDONLY))
		err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
					     sleb->endpt, UBI_SHORTTERM);

	*dirty = sleb->endpt - offs - used;
	*free = c->leb_size - sleb->endpt;

out:
	ubifs_scan_destroy(sleb);
	return err;

out_dump:
	ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
	dbg_dump_node(c, snod->node);
	ubifs_scan_destroy(sleb);
	return -EINVAL;
}
| 648 | |||
| 649 | /** | ||
| 650 | * insert_ref_node - insert a reference node to the replay tree. | ||
| 651 | * @c: UBIFS file-system description object | ||
| 652 | * @lnum: node logical eraseblock number | ||
| 653 | * @offs: node offset | ||
| 654 | * @sqnum: sequence number | ||
| 655 | * @free: amount of free space in bud | ||
| 656 | * @dirty: amount of dirty space from padding and deletion nodes | ||
| 657 | * | ||
| 658 | * This function inserts a reference node to the replay tree and returns zero | ||
| 659 | * in case of success ort a negative error code in case of failure. | ||
| 660 | */ | ||
| 661 | static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, | ||
| 662 | unsigned long long sqnum, int free, int dirty) | ||
| 663 | { | ||
| 664 | struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; | ||
| 665 | struct replay_entry *r; | ||
| 666 | |||
| 667 | dbg_mnt("add ref LEB %d:%d", lnum, offs); | ||
| 668 | while (*p) { | ||
| 669 | parent = *p; | ||
| 670 | r = rb_entry(parent, struct replay_entry, rb); | ||
| 671 | if (sqnum < r->sqnum) { | ||
| 672 | p = &(*p)->rb_left; | ||
| 673 | continue; | ||
| 674 | } else if (sqnum > r->sqnum) { | ||
| 675 | p = &(*p)->rb_right; | ||
| 676 | continue; | ||
| 677 | } | ||
| 678 | ubifs_err("duplicate sqnum in replay tree"); | ||
| 679 | return -EINVAL; | ||
| 680 | } | ||
| 681 | |||
| 682 | r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); | ||
| 683 | if (!r) | ||
| 684 | return -ENOMEM; | ||
| 685 | |||
| 686 | r->lnum = lnum; | ||
| 687 | r->offs = offs; | ||
| 688 | r->sqnum = sqnum; | ||
| 689 | r->flags = REPLAY_REF; | ||
| 690 | r->free = free; | ||
| 691 | r->dirty = dirty; | ||
| 692 | |||
| 693 | rb_link_node(&r->rb, parent, p); | ||
| 694 | rb_insert_color(&r->rb, &c->replay_tree); | ||
| 695 | return 0; | ||
| 696 | } | ||
| 697 | |||
| 698 | /** | ||
| 699 | * replay_buds - replay all buds. | ||
| 700 | * @c: UBIFS file-system description object | ||
| 701 | * | ||
| 702 | * This function returns zero in case of success and a negative error code in | ||
| 703 | * case of failure. | ||
| 704 | */ | ||
| 705 | static int replay_buds(struct ubifs_info *c) | ||
| 706 | { | ||
| 707 | struct bud_entry *b; | ||
| 708 | int err, uninitialized_var(free), uninitialized_var(dirty); | ||
| 709 | |||
| 710 | list_for_each_entry(b, &c->replay_buds, list) { | ||
| 711 | err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, | ||
| 712 | &free, &dirty); | ||
| 713 | if (err) | ||
| 714 | return err; | ||
| 715 | err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, | ||
| 716 | free, dirty); | ||
| 717 | if (err) | ||
| 718 | return err; | ||
| 719 | } | ||
| 720 | |||
| 721 | return 0; | ||
| 722 | } | ||
| 723 | |||
| 724 | /** | ||
| 725 | * destroy_bud_list - destroy the list of buds to replay. | ||
| 726 | * @c: UBIFS file-system description object | ||
| 727 | */ | ||
| 728 | static void destroy_bud_list(struct ubifs_info *c) | ||
| 729 | { | ||
| 730 | struct bud_entry *b; | ||
| 731 | |||
| 732 | while (!list_empty(&c->replay_buds)) { | ||
| 733 | b = list_entry(c->replay_buds.next, struct bud_entry, list); | ||
| 734 | list_del(&b->list); | ||
| 735 | kfree(b); | ||
| 736 | } | ||
| 737 | } | ||
| 738 | |||
| 739 | /** | ||
| 740 | * add_replay_bud - add a bud to the list of buds to replay. | ||
| 741 | * @c: UBIFS file-system description object | ||
| 742 | * @lnum: bud logical eraseblock number to replay | ||
| 743 | * @offs: bud start offset | ||
| 744 | * @jhead: journal head to which this bud belongs | ||
| 745 | * @sqnum: reference node sequence number | ||
| 746 | * | ||
| 747 | * This function returns zero in case of success and a negative error code in | ||
| 748 | * case of failure. | ||
| 749 | */ | ||
| 750 | static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, | ||
| 751 | unsigned long long sqnum) | ||
| 752 | { | ||
| 753 | struct ubifs_bud *bud; | ||
| 754 | struct bud_entry *b; | ||
| 755 | |||
| 756 | dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); | ||
| 757 | |||
| 758 | bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); | ||
| 759 | if (!bud) | ||
| 760 | return -ENOMEM; | ||
| 761 | |||
| 762 | b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); | ||
| 763 | if (!b) { | ||
| 764 | kfree(bud); | ||
| 765 | return -ENOMEM; | ||
| 766 | } | ||
| 767 | |||
| 768 | bud->lnum = lnum; | ||
| 769 | bud->start = offs; | ||
| 770 | bud->jhead = jhead; | ||
| 771 | ubifs_add_bud(c, bud); | ||
| 772 | |||
| 773 | b->bud = bud; | ||
| 774 | b->sqnum = sqnum; | ||
| 775 | list_add_tail(&b->list, &c->replay_buds); | ||
| 776 | |||
| 777 | return 0; | ||
| 778 | } | ||
| 779 | |||
/**
 * validate_ref - validate a reference node.
 * @c: UBIFS file-system description object
 * @ref: the reference node to validate
 *
 * This function returns %1 if a bud reference already exists for the LEB
 * described by @ref. %0 is returned if the reference node is new, otherwise
 * %-EINVAL is returned if validation failed.
 */
static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
{
	struct ubifs_bud *bud;
	int lnum = le32_to_cpu(ref->lnum);
	unsigned int offs = le32_to_cpu(ref->offs);
	unsigned int jhead = le32_to_cpu(ref->jhead);

	/*
	 * ref->offs may point to the end of LEB when the journal head points
	 * to the end of LEB and we write reference node for it during commit.
	 * This is why 'offs == c->leb_size' is allowed here and only
	 * 'offs > c->leb_size' is rejected.
	 */
	if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
	    lnum < c->main_first || offs > c->leb_size ||
	    offs & (c->min_io_size - 1))
		return -EINVAL;

	/* Make sure we have not already looked at this bud */
	bud = ubifs_search_bud(c, lnum);
	if (bud) {
		if (bud->jhead == jhead && bud->start <= offs)
			return 1;
		ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
		return -EINVAL;
	}

	return 0;
}
| 819 | |||
| 820 | /** | ||
| 821 | * replay_log_leb - replay a log logical eraseblock. | ||
| 822 | * @c: UBIFS file-system description object | ||
| 823 | * @lnum: log logical eraseblock to replay | ||
| 824 | * @offs: offset to start replaying from | ||
| 825 | * @sbuf: scan buffer | ||
| 826 | * | ||
| 827 | * This function replays a log LEB and returns zero in case of success, %1 if | ||
| 828 | * this is the last LEB in the log, and a negative error code in case of | ||
| 829 | * failure. | ||
| 830 | */ | ||
| 831 | static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) | ||
| 832 | { | ||
| 833 | int err; | ||
| 834 | struct ubifs_scan_leb *sleb; | ||
| 835 | struct ubifs_scan_node *snod; | ||
| 836 | const struct ubifs_cs_node *node; | ||
| 837 | |||
| 838 | dbg_mnt("replay log LEB %d:%d", lnum, offs); | ||
| 839 | sleb = ubifs_scan(c, lnum, offs, sbuf); | ||
| 840 | if (IS_ERR(sleb)) { | ||
| 841 | if (c->need_recovery) | ||
| 842 | sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); | ||
| 843 | if (IS_ERR(sleb)) | ||
| 844 | return PTR_ERR(sleb); | ||
| 845 | } | ||
| 846 | |||
| 847 | if (sleb->nodes_cnt == 0) { | ||
| 848 | err = 1; | ||
| 849 | goto out; | ||
| 850 | } | ||
| 851 | |||
| 852 | node = sleb->buf; | ||
| 853 | |||
| 854 | snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); | ||
| 855 | if (c->cs_sqnum == 0) { | ||
| 856 | /* | ||
| 857 | * This is the first log LEB we are looking at, make sure that | ||
| 858 | * the first node is a commit start node. Also record its | ||
| 859 | * sequence number so that UBIFS can determine where the log | ||
| 860 | * ends, because all nodes which were have higher sequence | ||
| 861 | * numbers. | ||
| 862 | */ | ||
| 863 | if (snod->type != UBIFS_CS_NODE) { | ||
| 864 | dbg_err("first log node at LEB %d:%d is not CS node", | ||
| 865 | lnum, offs); | ||
| 866 | goto out_dump; | ||
| 867 | } | ||
| 868 | if (le64_to_cpu(node->cmt_no) != c->cmt_no) { | ||
| 869 | dbg_err("first CS node at LEB %d:%d has wrong " | ||
| 870 | "commit number %llu expected %llu", | ||
| 871 | lnum, offs, | ||
| 872 | (unsigned long long)le64_to_cpu(node->cmt_no), | ||
| 873 | c->cmt_no); | ||
| 874 | goto out_dump; | ||
| 875 | } | ||
| 876 | |||
| 877 | c->cs_sqnum = le64_to_cpu(node->ch.sqnum); | ||
| 878 | dbg_mnt("commit start sqnum %llu", c->cs_sqnum); | ||
| 879 | } | ||
| 880 | |||
| 881 | if (snod->sqnum < c->cs_sqnum) { | ||
| 882 | /* | ||
| 883 | * This means that we reached end of log and now | ||
| 884 | * look to the older log data, which was already | ||
| 885 | * committed but the eraseblock was not erased (UBIFS | ||
| 886 | * only unmaps it). So this basically means we have to | ||
| 887 | * exit with "end of log" code. | ||
| 888 | */ | ||
| 889 | err = 1; | ||
| 890 | goto out; | ||
| 891 | } | ||
| 892 | |||
| 893 | /* Make sure the first node sits at offset zero of the LEB */ | ||
| 894 | if (snod->offs != 0) { | ||
| 895 | dbg_err("first node is not at zero offset"); | ||
| 896 | goto out_dump; | ||
| 897 | } | ||
| 898 | |||
| 899 | list_for_each_entry(snod, &sleb->nodes, list) { | ||
| 900 | |||
| 901 | cond_resched(); | ||
| 902 | |||
| 903 | if (snod->sqnum >= SQNUM_WATERMARK) { | ||
| 904 | ubifs_err("file system's life ended"); | ||
| 905 | goto out_dump; | ||
| 906 | } | ||
| 907 | |||
| 908 | if (snod->sqnum < c->cs_sqnum) { | ||
| 909 | dbg_err("bad sqnum %llu, commit sqnum %llu", | ||
| 910 | snod->sqnum, c->cs_sqnum); | ||
| 911 | goto out_dump; | ||
| 912 | } | ||
| 913 | |||
| 914 | if (snod->sqnum > c->max_sqnum) | ||
| 915 | c->max_sqnum = snod->sqnum; | ||
| 916 | |||
| 917 | switch (snod->type) { | ||
| 918 | case UBIFS_REF_NODE: { | ||
| 919 | const struct ubifs_ref_node *ref = snod->node; | ||
| 920 | |||
| 921 | err = validate_ref(c, ref); | ||
| 922 | if (err == 1) | ||
| 923 | break; /* Already have this bud */ | ||
| 924 | if (err) | ||
| 925 | goto out_dump; | ||
| 926 | |||
| 927 | err = add_replay_bud(c, le32_to_cpu(ref->lnum), | ||
| 928 | le32_to_cpu(ref->offs), | ||
| 929 | le32_to_cpu(ref->jhead), | ||
| 930 | snod->sqnum); | ||
| 931 | if (err) | ||
| 932 | goto out; | ||
| 933 | |||
| 934 | break; | ||
| 935 | } | ||
| 936 | case UBIFS_CS_NODE: | ||
| 937 | /* Make sure it sits at the beginning of LEB */ | ||
| 938 | if (snod->offs != 0) { | ||
| 939 | ubifs_err("unexpected node in log"); | ||
| 940 | goto out_dump; | ||
| 941 | } | ||
| 942 | break; | ||
| 943 | default: | ||
| 944 | ubifs_err("unexpected node in log"); | ||
| 945 | goto out_dump; | ||
| 946 | } | ||
| 947 | } | ||
| 948 | |||
| 949 | if (sleb->endpt || c->lhead_offs >= c->leb_size) { | ||
| 950 | c->lhead_lnum = lnum; | ||
| 951 | c->lhead_offs = sleb->endpt; | ||
| 952 | } | ||
| 953 | |||
| 954 | err = !sleb->endpt; | ||
| 955 | out: | ||
| 956 | ubifs_scan_destroy(sleb); | ||
| 957 | return err; | ||
| 958 | |||
| 959 | out_dump: | ||
| 960 | ubifs_err("log error detected while replying the log at LEB %d:%d", | ||
| 961 | lnum, offs + snod->offs); | ||
| 962 | dbg_dump_node(c, snod->node); | ||
| 963 | ubifs_scan_destroy(sleb); | ||
| 964 | return -EINVAL; | ||
| 965 | } | ||
| 966 | |||
| 967 | /** | ||
| 968 | * take_ihead - update the status of the index head in lprops to 'taken'. | ||
| 969 | * @c: UBIFS file-system description object | ||
| 970 | * | ||
| 971 | * This function returns the amount of free space in the index head LEB or a | ||
| 972 | * negative error code. | ||
| 973 | */ | ||
| 974 | static int take_ihead(struct ubifs_info *c) | ||
| 975 | { | ||
| 976 | const struct ubifs_lprops *lp; | ||
| 977 | int err, free; | ||
| 978 | |||
| 979 | ubifs_get_lprops(c); | ||
| 980 | |||
| 981 | lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); | ||
| 982 | if (IS_ERR(lp)) { | ||
| 983 | err = PTR_ERR(lp); | ||
| 984 | goto out; | ||
| 985 | } | ||
| 986 | |||
| 987 | free = lp->free; | ||
| 988 | |||
| 989 | lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, | ||
| 990 | lp->flags | LPROPS_TAKEN, 0); | ||
| 991 | if (IS_ERR(lp)) { | ||
| 992 | err = PTR_ERR(lp); | ||
| 993 | goto out; | ||
| 994 | } | ||
| 995 | |||
| 996 | err = free; | ||
| 997 | out: | ||
| 998 | ubifs_release_lprops(c); | ||
| 999 | return err; | ||
| 1000 | } | ||
| 1001 | |||
/**
 * ubifs_replay_journal - replay journal.
 * @c: UBIFS file-system description object
 *
 * This function scans the journal, replays and cleans it up. It makes sure all
 * memory data structures related to uncommitted journal are built (dirty TNC
 * tree, tree of buds, modified lprops, etc).
 *
 * Returns zero in case of success and a negative error code in case of
 * failure.
 */
int ubifs_replay_journal(struct ubifs_info *c)
{
	int err, i, lnum, offs, free;
	void *sbuf = NULL;

	BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);

	/* Update the status of the index head in lprops to 'taken' */
	free = take_ihead(c);
	if (free < 0)
		return free; /* Error code */

	/* The index head must sit exactly 'free' bytes before the LEB end */
	if (c->ihead_offs != c->leb_size - free) {
		ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
			  c->ihead_offs);
		return -EINVAL;
	}

	/* Scratch buffer large enough to hold one whole LEB while scanning */
	sbuf = vmalloc(c->leb_size);
	if (!sbuf)
		return -ENOMEM;

	dbg_mnt("start replaying the journal");

	c->replaying = 1;

	/* Start scanning the log area at the current log head */
	lnum = c->ltail_lnum = c->lhead_lnum;
	offs = c->lhead_offs;

	for (i = 0; i < c->log_lebs; i++, lnum++) {
		if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
			/*
			 * The log is logically circular, we reached the last
			 * LEB, switch to the first one.
			 */
			lnum = UBIFS_LOG_LNUM;
			offs = 0;
		}
		err = replay_log_leb(c, lnum, offs, sbuf);
		if (err == 1)
			/* We hit the end of the log */
			break;
		if (err)
			goto out;
		/* Only the first scanned LEB may start at a non-zero offset */
		offs = 0;
	}

	/* Replay the buds the scanned log referred to ... */
	err = replay_buds(c);
	if (err)
		goto out;

	/* ... and apply the accumulated replay tree */
	err = apply_replay_tree(c);
	if (err)
		goto out;

	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
		c->highest_inum);
out:
	/* Replay-only data structures are freed whether we succeeded or not */
	destroy_replay_tree(c);
	destroy_bud_list(c);
	vfree(sbuf);
	c->replaying = 0;
	return err;
}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c new file mode 100644 index 000000000000..2bf753b38889 --- /dev/null +++ b/fs/ubifs/sb.c | |||
| @@ -0,0 +1,629 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements UBIFS superblock. The superblock is stored at the first | ||
| 25 | * LEB of the volume and is never changed by UBIFS. Only user-space tools may | ||
| 26 | * change it. The superblock node mostly contains geometry information. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include "ubifs.h" | ||
| 30 | #include <linux/random.h> | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Default journal size in logical eraseblocks as a percent of total | ||
| 34 | * flash size. | ||
| 35 | */ | ||
| 36 | #define DEFAULT_JNL_PERCENT 5 | ||
| 37 | |||
| 38 | /* Default maximum journal size in bytes */ | ||
| 39 | #define DEFAULT_MAX_JNL (32*1024*1024) | ||
| 40 | |||
| 41 | /* Default indexing tree fanout */ | ||
| 42 | #define DEFAULT_FANOUT 8 | ||
| 43 | |||
| 44 | /* Default number of data journal heads */ | ||
| 45 | #define DEFAULT_JHEADS_CNT 1 | ||
| 46 | |||
| 47 | /* Default positions of different LEBs in the main area */ | ||
| 48 | #define DEFAULT_IDX_LEB 0 | ||
| 49 | #define DEFAULT_DATA_LEB 1 | ||
| 50 | #define DEFAULT_GC_LEB 2 | ||
| 51 | |||
| 52 | /* Default number of LEB numbers in LPT's save table */ | ||
| 53 | #define DEFAULT_LSAVE_CNT 256 | ||
| 54 | |||
| 55 | /* Default reserved pool size as a percent of maximum free space */ | ||
| 56 | #define DEFAULT_RP_PERCENT 5 | ||
| 57 | |||
| 58 | /* The default maximum size of reserved pool in bytes */ | ||
| 59 | #define DEFAULT_MAX_RP_SIZE (5*1024*1024) | ||
| 60 | |||
| 61 | /* Default time granularity in nanoseconds */ | ||
| 62 | #define DEFAULT_TIME_GRAN 1000000000 | ||
| 63 | |||
| 64 | /** | ||
| 65 | * create_default_filesystem - format empty UBI volume. | ||
| 66 | * @c: UBIFS file-system description object | ||
| 67 | * | ||
| 68 | * This function creates default empty file-system. Returns zero in case of | ||
| 69 | * success and a negative error code in case of failure. | ||
| 70 | */ | ||
| 71 | static int create_default_filesystem(struct ubifs_info *c) | ||
| 72 | { | ||
| 73 | struct ubifs_sb_node *sup; | ||
| 74 | struct ubifs_mst_node *mst; | ||
| 75 | struct ubifs_idx_node *idx; | ||
| 76 | struct ubifs_branch *br; | ||
| 77 | struct ubifs_ino_node *ino; | ||
| 78 | struct ubifs_cs_node *cs; | ||
| 79 | union ubifs_key key; | ||
| 80 | int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; | ||
| 81 | int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; | ||
| 82 | int min_leb_cnt = UBIFS_MIN_LEB_CNT; | ||
| 83 | uint64_t tmp64, main_bytes; | ||
| 84 | |||
| 85 | /* Some functions called from here depend on the @c->key_len filed */ | ||
| 86 | c->key_len = UBIFS_SK_LEN; | ||
| 87 | |||
| 88 | /* | ||
| 89 | * First of all, we have to calculate default file-system geometry - | ||
| 90 | * log size, journal size, etc. | ||
| 91 | */ | ||
| 92 | if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) | ||
| 93 | /* We can first multiply then divide and have no overflow */ | ||
| 94 | jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; | ||
| 95 | else | ||
| 96 | jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; | ||
| 97 | |||
| 98 | if (jnl_lebs < UBIFS_MIN_JNL_LEBS) | ||
| 99 | jnl_lebs = UBIFS_MIN_JNL_LEBS; | ||
| 100 | if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) | ||
| 101 | jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; | ||
| 102 | |||
| 103 | /* | ||
| 104 | * The log should be large enough to fit reference nodes for all bud | ||
| 105 | * LEBs. Because buds do not have to start from the beginning of LEBs | ||
| 106 | * (half of the LEB may contain committed data), the log should | ||
| 107 | * generally be larger, make it twice as large. | ||
| 108 | */ | ||
| 109 | tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; | ||
| 110 | log_lebs = tmp / c->leb_size; | ||
| 111 | /* Plus one LEB reserved for commit */ | ||
| 112 | log_lebs += 1; | ||
| 113 | if (c->leb_cnt - min_leb_cnt > 8) { | ||
| 114 | /* And some extra space to allow writes while committing */ | ||
| 115 | log_lebs += 1; | ||
| 116 | min_leb_cnt += 1; | ||
| 117 | } | ||
| 118 | |||
| 119 | max_buds = jnl_lebs - log_lebs; | ||
| 120 | if (max_buds < UBIFS_MIN_BUD_LEBS) | ||
| 121 | max_buds = UBIFS_MIN_BUD_LEBS; | ||
| 122 | |||
| 123 | /* | ||
| 124 | * Orphan nodes are stored in a separate area. One node can store a lot | ||
| 125 | * of orphan inode numbers, but when new orphan comes we just add a new | ||
| 126 | * orphan node. At some point the nodes are consolidated into one | ||
| 127 | * orphan node. | ||
| 128 | */ | ||
| 129 | orph_lebs = UBIFS_MIN_ORPH_LEBS; | ||
| 130 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 131 | if (c->leb_cnt - min_leb_cnt > 1) | ||
| 132 | /* | ||
| 133 | * For debugging purposes it is better to have at least 2 | ||
| 134 | * orphan LEBs, because the orphan subsystem would need to do | ||
| 135 | * consolidations and would be stressed more. | ||
| 136 | */ | ||
| 137 | orph_lebs += 1; | ||
| 138 | #endif | ||
| 139 | |||
| 140 | main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; | ||
| 141 | main_lebs -= orph_lebs; | ||
| 142 | |||
| 143 | lpt_first = UBIFS_LOG_LNUM + log_lebs; | ||
| 144 | c->lsave_cnt = DEFAULT_LSAVE_CNT; | ||
| 145 | c->max_leb_cnt = c->leb_cnt; | ||
| 146 | err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, | ||
| 147 | &big_lpt); | ||
| 148 | if (err) | ||
| 149 | return err; | ||
| 150 | |||
| 151 | dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, | ||
| 152 | lpt_first + lpt_lebs - 1); | ||
| 153 | |||
| 154 | main_first = c->leb_cnt - main_lebs; | ||
| 155 | |||
| 156 | /* Create default superblock */ | ||
| 157 | tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); | ||
| 158 | sup = kzalloc(tmp, GFP_KERNEL); | ||
| 159 | if (!sup) | ||
| 160 | return -ENOMEM; | ||
| 161 | |||
| 162 | tmp64 = (uint64_t)max_buds * c->leb_size; | ||
| 163 | if (big_lpt) | ||
| 164 | sup_flags |= UBIFS_FLG_BIGLPT; | ||
| 165 | |||
| 166 | sup->ch.node_type = UBIFS_SB_NODE; | ||
| 167 | sup->key_hash = UBIFS_KEY_HASH_R5; | ||
| 168 | sup->flags = cpu_to_le32(sup_flags); | ||
| 169 | sup->min_io_size = cpu_to_le32(c->min_io_size); | ||
| 170 | sup->leb_size = cpu_to_le32(c->leb_size); | ||
| 171 | sup->leb_cnt = cpu_to_le32(c->leb_cnt); | ||
| 172 | sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); | ||
| 173 | sup->max_bud_bytes = cpu_to_le64(tmp64); | ||
| 174 | sup->log_lebs = cpu_to_le32(log_lebs); | ||
| 175 | sup->lpt_lebs = cpu_to_le32(lpt_lebs); | ||
| 176 | sup->orph_lebs = cpu_to_le32(orph_lebs); | ||
| 177 | sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); | ||
| 178 | sup->fanout = cpu_to_le32(DEFAULT_FANOUT); | ||
| 179 | sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); | ||
| 180 | sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); | ||
| 181 | sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); | ||
| 182 | sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); | ||
| 183 | |||
| 184 | generate_random_uuid(sup->uuid); | ||
| 185 | |||
| 186 | main_bytes = (uint64_t)main_lebs * c->leb_size; | ||
| 187 | tmp64 = main_bytes * DEFAULT_RP_PERCENT; | ||
| 188 | do_div(tmp64, 100); | ||
| 189 | if (tmp64 > DEFAULT_MAX_RP_SIZE) | ||
| 190 | tmp64 = DEFAULT_MAX_RP_SIZE; | ||
| 191 | sup->rp_size = cpu_to_le64(tmp64); | ||
| 192 | |||
| 193 | err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); | ||
| 194 | kfree(sup); | ||
| 195 | if (err) | ||
| 196 | return err; | ||
| 197 | |||
| 198 | dbg_gen("default superblock created at LEB 0:0"); | ||
| 199 | |||
| 200 | /* Create default master node */ | ||
| 201 | mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); | ||
| 202 | if (!mst) | ||
| 203 | return -ENOMEM; | ||
| 204 | |||
| 205 | mst->ch.node_type = UBIFS_MST_NODE; | ||
| 206 | mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); | ||
| 207 | mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); | ||
| 208 | mst->cmt_no = 0; | ||
| 209 | mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); | ||
| 210 | mst->root_offs = 0; | ||
| 211 | tmp = ubifs_idx_node_sz(c, 1); | ||
| 212 | mst->root_len = cpu_to_le32(tmp); | ||
| 213 | mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); | ||
| 214 | mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); | ||
| 215 | mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); | ||
| 216 | mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); | ||
| 217 | mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); | ||
| 218 | mst->lpt_offs = cpu_to_le32(c->lpt_offs); | ||
| 219 | mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); | ||
| 220 | mst->nhead_offs = cpu_to_le32(c->nhead_offs); | ||
| 221 | mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); | ||
| 222 | mst->ltab_offs = cpu_to_le32(c->ltab_offs); | ||
| 223 | mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); | ||
| 224 | mst->lsave_offs = cpu_to_le32(c->lsave_offs); | ||
| 225 | mst->lscan_lnum = cpu_to_le32(main_first); | ||
| 226 | mst->empty_lebs = cpu_to_le32(main_lebs - 2); | ||
| 227 | mst->idx_lebs = cpu_to_le32(1); | ||
| 228 | mst->leb_cnt = cpu_to_le32(c->leb_cnt); | ||
| 229 | |||
| 230 | /* Calculate lprops statistics */ | ||
| 231 | tmp64 = main_bytes; | ||
| 232 | tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); | ||
| 233 | tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); | ||
| 234 | mst->total_free = cpu_to_le64(tmp64); | ||
| 235 | |||
| 236 | tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); | ||
| 237 | ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - | ||
| 238 | UBIFS_INO_NODE_SZ; | ||
| 239 | tmp64 += ino_waste; | ||
| 240 | tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); | ||
| 241 | mst->total_dirty = cpu_to_le64(tmp64); | ||
| 242 | |||
| 243 | /* The indexing LEB does not contribute to dark space */ | ||
| 244 | tmp64 = (c->main_lebs - 1) * c->dark_wm; | ||
| 245 | mst->total_dark = cpu_to_le64(tmp64); | ||
| 246 | |||
| 247 | mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); | ||
| 248 | |||
| 249 | err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, | ||
| 250 | UBI_UNKNOWN); | ||
| 251 | if (err) { | ||
| 252 | kfree(mst); | ||
| 253 | return err; | ||
| 254 | } | ||
| 255 | err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, | ||
| 256 | UBI_UNKNOWN); | ||
| 257 | kfree(mst); | ||
| 258 | if (err) | ||
| 259 | return err; | ||
| 260 | |||
| 261 | dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); | ||
| 262 | |||
| 263 | /* Create the root indexing node */ | ||
| 264 | tmp = ubifs_idx_node_sz(c, 1); | ||
| 265 | idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); | ||
| 266 | if (!idx) | ||
| 267 | return -ENOMEM; | ||
| 268 | |||
| 269 | c->key_fmt = UBIFS_SIMPLE_KEY_FMT; | ||
| 270 | c->key_hash = key_r5_hash; | ||
| 271 | |||
| 272 | idx->ch.node_type = UBIFS_IDX_NODE; | ||
| 273 | idx->child_cnt = cpu_to_le16(1); | ||
| 274 | ino_key_init(c, &key, UBIFS_ROOT_INO); | ||
| 275 | br = ubifs_idx_branch(c, idx, 0); | ||
| 276 | key_write_idx(c, &key, &br->key); | ||
| 277 | br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); | ||
| 278 | br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); | ||
| 279 | err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, | ||
| 280 | UBI_UNKNOWN); | ||
| 281 | kfree(idx); | ||
| 282 | if (err) | ||
| 283 | return err; | ||
| 284 | |||
| 285 | dbg_gen("default root indexing node created LEB %d:0", | ||
| 286 | main_first + DEFAULT_IDX_LEB); | ||
| 287 | |||
| 288 | /* Create default root inode */ | ||
| 289 | tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); | ||
| 290 | ino = kzalloc(tmp, GFP_KERNEL); | ||
| 291 | if (!ino) | ||
| 292 | return -ENOMEM; | ||
| 293 | |||
| 294 | ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); | ||
| 295 | ino->ch.node_type = UBIFS_INO_NODE; | ||
| 296 | ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); | ||
| 297 | ino->nlink = cpu_to_le32(2); | ||
| 298 | tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); | ||
| 299 | ino->atime_sec = tmp; | ||
| 300 | ino->ctime_sec = tmp; | ||
| 301 | ino->mtime_sec = tmp; | ||
| 302 | ino->atime_nsec = 0; | ||
| 303 | ino->ctime_nsec = 0; | ||
| 304 | ino->mtime_nsec = 0; | ||
| 305 | ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); | ||
| 306 | ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); | ||
| 307 | |||
| 308 | /* Set compression enabled by default */ | ||
| 309 | ino->flags = cpu_to_le32(UBIFS_COMPR_FL); | ||
| 310 | |||
| 311 | err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, | ||
| 312 | main_first + DEFAULT_DATA_LEB, 0, | ||
| 313 | UBI_UNKNOWN); | ||
| 314 | kfree(ino); | ||
| 315 | if (err) | ||
| 316 | return err; | ||
| 317 | |||
| 318 | dbg_gen("root inode created at LEB %d:0", | ||
| 319 | main_first + DEFAULT_DATA_LEB); | ||
| 320 | |||
| 321 | /* | ||
| 322 | * The first node in the log has to be the commit start node. This is | ||
| 323 | * always the case during normal file-system operation. Write a fake | ||
| 324 | * commit start node to the log. | ||
| 325 | */ | ||
| 326 | tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); | ||
| 327 | cs = kzalloc(tmp, GFP_KERNEL); | ||
| 328 | if (!cs) | ||
| 329 | return -ENOMEM; | ||
| 330 | |||
| 331 | cs->ch.node_type = UBIFS_CS_NODE; | ||
| 332 | err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, | ||
| 333 | 0, UBI_UNKNOWN); | ||
| 334 | kfree(cs); | ||
| 335 | |||
| 336 | ubifs_msg("default file-system created"); | ||
| 337 | return 0; | ||
| 338 | } | ||
| 339 | |||
/**
 * validate_sb - validate superblock node.
 * @c: UBIFS file-system description object
 * @sup: superblock node
 *
 * This function validates superblock node @sup. Since most of data was read
 * from the superblock and stored in @c, the function validates fields in @c
 * instead. Returns zero in case of success and %-EINVAL in case of validation
 * failure.
 */
static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
{
	long long max_bytes;
	int err = 1, min_leb_cnt;

	/*
	 * Each check either sets 'err' to a distinct number (reported by the
	 * "bad superblock, error %d" message at the 'failed' label) or prints
	 * its own detailed message before jumping there.
	 */
	if (!c->key_hash) {
		err = 2;
		goto failed;
	}

	if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
		err = 3;
		goto failed;
	}

	/* Geometry recorded in the superblock must match the real volume */
	if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
		ubifs_err("min. I/O unit mismatch: %d in superblock, %d real",
			  le32_to_cpu(sup->min_io_size), c->min_io_size);
		goto failed;
	}

	if (le32_to_cpu(sup->leb_size) != c->leb_size) {
		ubifs_err("LEB size mismatch: %d in superblock, %d real",
			  le32_to_cpu(sup->leb_size), c->leb_size);
		goto failed;
	}

	/* Each dedicated area must have at least its minimum LEB count */
	if (c->log_lebs < UBIFS_MIN_LOG_LEBS ||
	    c->lpt_lebs < UBIFS_MIN_LPT_LEBS ||
	    c->orph_lebs < UBIFS_MIN_ORPH_LEBS ||
	    c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
		err = 4;
		goto failed;
	}

	/*
	 * Calculate minimum allowed amount of main area LEBs. This is very
	 * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we
	 * have just read from the superblock.
	 */
	min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs;
	min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;

	if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
		ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, "
			  "%d minimum required", c->leb_cnt, c->vi.size,
			  min_leb_cnt);
		goto failed;
	}

	if (c->max_leb_cnt < c->leb_cnt) {
		ubifs_err("max. LEB count %d less than LEB count %d",
			  c->max_leb_cnt, c->leb_cnt);
		goto failed;
	}

	if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
		err = 7;
		goto failed;
	}

	/* Bud space must be sane with respect to the main area size */
	if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
	    c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
		err = 8;
		goto failed;
	}

	if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 ||
	    c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) {
		err = 9;
		goto failed;
	}

	/* An index node with the configured fanout must fit in one LEB */
	if (c->fanout < UBIFS_MIN_FANOUT ||
	    ubifs_idx_node_sz(c, c->fanout) > c->leb_size) {
		err = 10;
		goto failed;
	}

	if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT &&
	    c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS -
	    c->log_lebs - c->lpt_lebs - c->orph_lebs)) {
		err = 11;
		goto failed;
	}

	/* All areas together must account for every LEB of the volume */
	if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs +
	    c->orph_lebs + c->main_lebs != c->leb_cnt) {
		err = 12;
		goto failed;
	}

	if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
		err = 13;
		goto failed;
	}

	/* The reserved pool cannot exceed the whole main area */
	max_bytes = c->main_lebs * (long long)c->leb_size;
	if (c->rp_size < 0 || max_bytes < c->rp_size) {
		err = 14;
		goto failed;
	}

	/* Time granularity is bounded by 1ns..1s */
	if (le32_to_cpu(sup->time_gran) > 1000000000 ||
	    le32_to_cpu(sup->time_gran) < 1) {
		err = 15;
		goto failed;
	}

	return 0;

failed:
	ubifs_err("bad superblock, error %d", err);
	dbg_dump_node(c, sup);
	return -EINVAL;
}
| 466 | |||
| 467 | /** | ||
| 468 | * ubifs_read_sb_node - read superblock node. | ||
| 469 | * @c: UBIFS file-system description object | ||
| 470 | * | ||
| 471 | * This function returns a pointer to the superblock node or a negative error | ||
| 472 | * code. | ||
| 473 | */ | ||
| 474 | struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) | ||
| 475 | { | ||
| 476 | struct ubifs_sb_node *sup; | ||
| 477 | int err; | ||
| 478 | |||
| 479 | sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); | ||
| 480 | if (!sup) | ||
| 481 | return ERR_PTR(-ENOMEM); | ||
| 482 | |||
| 483 | err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, | ||
| 484 | UBIFS_SB_LNUM, 0); | ||
| 485 | if (err) { | ||
| 486 | kfree(sup); | ||
| 487 | return ERR_PTR(err); | ||
| 488 | } | ||
| 489 | |||
| 490 | return sup; | ||
| 491 | } | ||
| 492 | |||
| 493 | /** | ||
| 494 | * ubifs_write_sb_node - write superblock node. | ||
| 495 | * @c: UBIFS file-system description object | ||
| 496 | * @sup: superblock node read with 'ubifs_read_sb_node()' | ||
| 497 | * | ||
| 498 | * This function returns %0 on success and a negative error code on failure. | ||
| 499 | */ | ||
| 500 | int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) | ||
| 501 | { | ||
| 502 | int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); | ||
| 503 | |||
| 504 | ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); | ||
| 505 | return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); | ||
| 506 | } | ||
| 507 | |||
| 508 | /** | ||
| 509 | * ubifs_read_superblock - read superblock. | ||
| 510 | * @c: UBIFS file-system description object | ||
| 511 | * | ||
| 512 | * This function finds, reads and checks the superblock. If an empty UBI volume | ||
| 513 | * is being mounted, this function creates default superblock. Returns zero in | ||
| 514 | * case of success, and a negative error code in case of failure. | ||
| 515 | */ | ||
| 516 | int ubifs_read_superblock(struct ubifs_info *c) | ||
| 517 | { | ||
| 518 | int err, sup_flags; | ||
| 519 | struct ubifs_sb_node *sup; | ||
| 520 | |||
| 521 | if (c->empty) { | ||
| 522 | err = create_default_filesystem(c); | ||
| 523 | if (err) | ||
| 524 | return err; | ||
| 525 | } | ||
| 526 | |||
| 527 | sup = ubifs_read_sb_node(c); | ||
| 528 | if (IS_ERR(sup)) | ||
| 529 | return PTR_ERR(sup); | ||
| 530 | |||
| 531 | /* | ||
| 532 | * The software supports all previous versions but not future versions, | ||
| 533 | * due to the unavailability of time-travelling equipment. | ||
| 534 | */ | ||
| 535 | c->fmt_version = le32_to_cpu(sup->fmt_version); | ||
| 536 | if (c->fmt_version > UBIFS_FORMAT_VERSION) { | ||
| 537 | ubifs_err("on-flash format version is %d, but software only " | ||
| 538 | "supports up to version %d", c->fmt_version, | ||
| 539 | UBIFS_FORMAT_VERSION); | ||
| 540 | err = -EINVAL; | ||
| 541 | goto out; | ||
| 542 | } | ||
| 543 | |||
| 544 | if (c->fmt_version < 3) { | ||
| 545 | ubifs_err("on-flash format version %d is not supported", | ||
| 546 | c->fmt_version); | ||
| 547 | err = -EINVAL; | ||
| 548 | goto out; | ||
| 549 | } | ||
| 550 | |||
| 551 | switch (sup->key_hash) { | ||
| 552 | case UBIFS_KEY_HASH_R5: | ||
| 553 | c->key_hash = key_r5_hash; | ||
| 554 | c->key_hash_type = UBIFS_KEY_HASH_R5; | ||
| 555 | break; | ||
| 556 | |||
| 557 | case UBIFS_KEY_HASH_TEST: | ||
| 558 | c->key_hash = key_test_hash; | ||
| 559 | c->key_hash_type = UBIFS_KEY_HASH_TEST; | ||
| 560 | break; | ||
| 561 | }; | ||
| 562 | |||
| 563 | c->key_fmt = sup->key_fmt; | ||
| 564 | |||
| 565 | switch (c->key_fmt) { | ||
| 566 | case UBIFS_SIMPLE_KEY_FMT: | ||
| 567 | c->key_len = UBIFS_SK_LEN; | ||
| 568 | break; | ||
| 569 | default: | ||
| 570 | ubifs_err("unsupported key format"); | ||
| 571 | err = -EINVAL; | ||
| 572 | goto out; | ||
| 573 | } | ||
| 574 | |||
| 575 | c->leb_cnt = le32_to_cpu(sup->leb_cnt); | ||
| 576 | c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); | ||
| 577 | c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); | ||
| 578 | c->log_lebs = le32_to_cpu(sup->log_lebs); | ||
| 579 | c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); | ||
| 580 | c->orph_lebs = le32_to_cpu(sup->orph_lebs); | ||
| 581 | c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; | ||
| 582 | c->fanout = le32_to_cpu(sup->fanout); | ||
| 583 | c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); | ||
| 584 | c->default_compr = le16_to_cpu(sup->default_compr); | ||
| 585 | c->rp_size = le64_to_cpu(sup->rp_size); | ||
| 586 | c->rp_uid = le32_to_cpu(sup->rp_uid); | ||
| 587 | c->rp_gid = le32_to_cpu(sup->rp_gid); | ||
| 588 | sup_flags = le32_to_cpu(sup->flags); | ||
| 589 | |||
| 590 | c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); | ||
| 591 | |||
| 592 | memcpy(&c->uuid, &sup->uuid, 16); | ||
| 593 | |||
| 594 | c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); | ||
| 595 | |||
| 596 | /* Automatically increase file system size to the maximum size */ | ||
| 597 | c->old_leb_cnt = c->leb_cnt; | ||
| 598 | if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { | ||
| 599 | c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); | ||
| 600 | if (c->vfs_sb->s_flags & MS_RDONLY) | ||
| 601 | dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", | ||
| 602 | c->old_leb_cnt, c->leb_cnt); | ||
| 603 | else { | ||
| 604 | dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", | ||
| 605 | c->old_leb_cnt, c->leb_cnt); | ||
| 606 | sup->leb_cnt = cpu_to_le32(c->leb_cnt); | ||
| 607 | err = ubifs_write_sb_node(c, sup); | ||
| 608 | if (err) | ||
| 609 | goto out; | ||
| 610 | c->old_leb_cnt = c->leb_cnt; | ||
| 611 | } | ||
| 612 | } | ||
| 613 | |||
| 614 | c->log_bytes = (long long)c->log_lebs * c->leb_size; | ||
| 615 | c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; | ||
| 616 | c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; | ||
| 617 | c->lpt_last = c->lpt_first + c->lpt_lebs - 1; | ||
| 618 | c->orph_first = c->lpt_last + 1; | ||
| 619 | c->orph_last = c->orph_first + c->orph_lebs - 1; | ||
| 620 | c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; | ||
| 621 | c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; | ||
| 622 | c->main_first = c->leb_cnt - c->main_lebs; | ||
| 623 | c->report_rp_size = ubifs_reported_space(c, c->rp_size); | ||
| 624 | |||
| 625 | err = validate_sb(c, sup); | ||
| 626 | out: | ||
| 627 | kfree(sup); | ||
| 628 | return err; | ||
| 629 | } | ||
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c new file mode 100644 index 000000000000..acf5c5fffc60 --- /dev/null +++ b/fs/ubifs/scan.c | |||
| @@ -0,0 +1,362 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
/*
 * This file implements the scan which is a general-purpose function for
 * determining what nodes are in an eraseblock. The scan is used to replay the
 * journal, to do garbage collection, for the TNC in-the-gaps method, and by
 * debugging functions.
 */
| 29 | |||
| 30 | #include "ubifs.h" | ||
| 31 | |||
| 32 | /** | ||
| 33 | * scan_padding_bytes - scan for padding bytes. | ||
| 34 | * @buf: buffer to scan | ||
| 35 | * @len: length of buffer | ||
| 36 | * | ||
| 37 | * This function returns the number of padding bytes on success and | ||
| 38 | * %SCANNED_GARBAGE on failure. | ||
| 39 | */ | ||
| 40 | static int scan_padding_bytes(void *buf, int len) | ||
| 41 | { | ||
| 42 | int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len); | ||
| 43 | uint8_t *p = buf; | ||
| 44 | |||
| 45 | dbg_scan("not a node"); | ||
| 46 | |||
| 47 | while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE) | ||
| 48 | pad_len += 1; | ||
| 49 | |||
| 50 | if (!pad_len || (pad_len & 7)) | ||
| 51 | return SCANNED_GARBAGE; | ||
| 52 | |||
| 53 | dbg_scan("%d padding bytes", pad_len); | ||
| 54 | |||
| 55 | return pad_len; | ||
| 56 | } | ||
| 57 | |||
/**
 * ubifs_scan_a_node - scan for a node or padding.
 * @c: UBIFS file-system description object
 * @buf: buffer to scan
 * @len: length of buffer
 * @lnum: logical eraseblock number
 * @offs: offset within the logical eraseblock
 * @quiet: print no messages
 *
 * This function examines the beginning of @buf and returns a scanning code
 * describing what was found: %SCANNED_EMPTY_SPACE if the magic is all-0xFF
 * (erased flash), %SCANNED_A_NODE for a valid non-padding node, a positive
 * byte count for padding bytes or a valid padding node (the caller should
 * skip that many bytes), and %SCANNED_GARBAGE, %SCANNED_A_CORRUPT_NODE or
 * %SCANNED_A_BAD_PAD_NODE on the corresponding failures.
 */
int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
		      int offs, int quiet)
{
	struct ubifs_ch *ch = buf;
	uint32_t magic;

	magic = le32_to_cpu(ch->magic);

	if (magic == 0xFFFFFFFF) {
		/* All-0xFF magic: unwritten (erased) flash area */
		dbg_scan("hit empty space");
		return SCANNED_EMPTY_SPACE;
	}

	if (magic != UBIFS_NODE_MAGIC)
		/* Not a node - may be a run of raw padding bytes */
		return scan_padding_bytes(buf, len);

	if (len < UBIFS_CH_SZ)
		/* Too short to hold even the common node header */
		return SCANNED_GARBAGE;

	dbg_scan("scanning %s", dbg_ntype(ch->node_type));

	/* Verify CRC and header fields of the node */
	if (ubifs_check_node(c, buf, lnum, offs, quiet))
		return SCANNED_A_CORRUPT_NODE;

	if (ch->node_type == UBIFS_PAD_NODE) {
		struct ubifs_pad_node *pad = buf;
		int pad_len = le32_to_cpu(pad->pad_len);
		int node_len = le32_to_cpu(ch->len);

		/* Validate the padding node */
		if (pad_len < 0 ||
		    offs + node_len + pad_len > c->leb_size) {
			if (!quiet) {
				ubifs_err("bad pad node at LEB %d:%d",
					  lnum, offs);
				dbg_dump_node(c, pad);
			}
			return SCANNED_A_BAD_PAD_NODE;
		}

		/* Make the node pads to 8-byte boundary */
		if ((node_len + pad_len) & 7) {
			if (!quiet) {
				dbg_err("bad padding length %d - %d",
					offs, offs + node_len + pad_len);
			}
			return SCANNED_A_BAD_PAD_NODE;
		}

		dbg_scan("%d bytes padded, offset now %d",
			 pad_len, ALIGN(offs + node_len + pad_len, 8));

		/* Report the whole padded region so the caller skips it */
		return node_len + pad_len;
	}

	return SCANNED_A_NODE;
}
| 126 | |||
/**
 * ubifs_start_scan - create LEB scanning information at start of scan.
 * @c: UBIFS file-system description object
 * @lnum: logical eraseblock number
 * @offs: offset to start at (usually zero)
 * @sbuf: scan buffer (must be c->leb_size)
 *
 * This function reads the whole LEB contents from @offs into @sbuf and
 * allocates and initializes the scanning information structure. Returns the
 * scanning information on success and an error pointer (%-ENOMEM or a read
 * error code) on failure.
 */
struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
					int offs, void *sbuf)
{
	struct ubifs_scan_leb *sleb;
	int err;

	dbg_scan("scan LEB %d:%d", lnum, offs);

	sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS);
	if (!sleb)
		return ERR_PTR(-ENOMEM);

	sleb->lnum = lnum;
	INIT_LIST_HEAD(&sleb->nodes);
	sleb->buf = sbuf;

	err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs);
	if (err && err != -EBADMSG) {
		ubifs_err("cannot read %d bytes from LEB %d:%d,"
			  " error %d", c->leb_size - offs, lnum, offs, err);
		kfree(sleb);
		return ERR_PTR(err);
	}

	/*
	 * ECC errors (-EBADMSG) are tolerated here: record them in the
	 * scanning information and let the caller decide what to do.
	 */
	if (err == -EBADMSG)
		sleb->ecc = 1;

	return sleb;
}
| 165 | |||
| 166 | /** | ||
| 167 | * ubifs_end_scan - update LEB scanning information at end of scan. | ||
| 168 | * @c: UBIFS file-system description object | ||
| 169 | * @sleb: scanning information | ||
| 170 | * @lnum: logical eraseblock number | ||
| 171 | * @offs: offset to start at (usually zero) | ||
| 172 | * | ||
| 173 | * This function returns %0 on success and a negative error code on failure. | ||
| 174 | */ | ||
| 175 | void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, | ||
| 176 | int lnum, int offs) | ||
| 177 | { | ||
| 178 | lnum = lnum; | ||
| 179 | dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); | ||
| 180 | ubifs_assert(offs % c->min_io_size == 0); | ||
| 181 | |||
| 182 | sleb->endpt = ALIGN(offs, c->min_io_size); | ||
| 183 | } | ||
| 184 | |||
| 185 | /** | ||
| 186 | * ubifs_add_snod - add a scanned node to LEB scanning information. | ||
| 187 | * @c: UBIFS file-system description object | ||
| 188 | * @sleb: scanning information | ||
| 189 | * @buf: buffer containing node | ||
| 190 | * @offs: offset of node on flash | ||
| 191 | * | ||
| 192 | * This function returns %0 on success and a negative error code on failure. | ||
| 193 | */ | ||
| 194 | int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, | ||
| 195 | void *buf, int offs) | ||
| 196 | { | ||
| 197 | struct ubifs_ch *ch = buf; | ||
| 198 | struct ubifs_ino_node *ino = buf; | ||
| 199 | struct ubifs_scan_node *snod; | ||
| 200 | |||
| 201 | snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); | ||
| 202 | if (!snod) | ||
| 203 | return -ENOMEM; | ||
| 204 | |||
| 205 | snod->sqnum = le64_to_cpu(ch->sqnum); | ||
| 206 | snod->type = ch->node_type; | ||
| 207 | snod->offs = offs; | ||
| 208 | snod->len = le32_to_cpu(ch->len); | ||
| 209 | snod->node = buf; | ||
| 210 | |||
| 211 | switch (ch->node_type) { | ||
| 212 | case UBIFS_INO_NODE: | ||
| 213 | case UBIFS_DENT_NODE: | ||
| 214 | case UBIFS_XENT_NODE: | ||
| 215 | case UBIFS_DATA_NODE: | ||
| 216 | case UBIFS_TRUN_NODE: | ||
| 217 | /* | ||
| 218 | * The key is in the same place in all keyed | ||
| 219 | * nodes. | ||
| 220 | */ | ||
| 221 | key_read(c, &ino->key, &snod->key); | ||
| 222 | break; | ||
| 223 | } | ||
| 224 | list_add_tail(&snod->list, &sleb->nodes); | ||
| 225 | sleb->nodes_cnt += 1; | ||
| 226 | return 0; | ||
| 227 | } | ||
| 228 | |||
| 229 | /** | ||
| 230 | * ubifs_scanned_corruption - print information after UBIFS scanned corruption. | ||
| 231 | * @c: UBIFS file-system description object | ||
| 232 | * @lnum: LEB number of corruption | ||
| 233 | * @offs: offset of corruption | ||
| 234 | * @buf: buffer containing corruption | ||
| 235 | */ | ||
| 236 | void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, | ||
| 237 | void *buf) | ||
| 238 | { | ||
| 239 | int len; | ||
| 240 | |||
| 241 | ubifs_err("corrupted data at LEB %d:%d", lnum, offs); | ||
| 242 | if (dbg_failure_mode) | ||
| 243 | return; | ||
| 244 | len = c->leb_size - offs; | ||
| 245 | if (len > 4096) | ||
| 246 | len = 4096; | ||
| 247 | dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); | ||
| 248 | print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); | ||
| 249 | } | ||
| 250 | |||
/**
 * ubifs_scan - scan a logical eraseblock.
 * @c: UBIFS file-system description object
 * @lnum: logical eraseblock number
 * @offs: offset to start at (usually zero)
 * @sbuf: scan buffer (must be c->leb_size)
 *
 * This function scans LEB number @lnum and returns complete information about
 * its contents. Returns an error code in case of failure.
 */
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
				  int offs, void *sbuf)
{
	void *buf = sbuf + offs;
	int err, len = c->leb_size - offs;
	struct ubifs_scan_leb *sleb;

	/* Read the LEB into the scan buffer and set up the sleb structure */
	sleb = ubifs_start_scan(c, lnum, offs, sbuf);
	if (IS_ERR(sleb))
		return sleb;

	/* Walk the buffer node by node (nodes are 8-byte aligned) */
	while (len >= 8) {
		struct ubifs_ch *ch = buf;
		int node_len, ret;

		dbg_scan("look at LEB %d:%d (%d bytes left)",
			 lnum, offs, len);

		cond_resched();

		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);

		if (ret > 0) {
			/* Padding bytes or a valid padding node */
			offs += ret;
			buf += ret;
			len -= ret;
			continue;
		}

		if (ret == SCANNED_EMPTY_SPACE)
			/* Empty space is checked later */
			break;

		switch (ret) {
		case SCANNED_GARBAGE:
			dbg_err("garbage");
			goto corrupted;
		case SCANNED_A_NODE:
			break;
		case SCANNED_A_CORRUPT_NODE:
		case SCANNED_A_BAD_PAD_NODE:
			dbg_err("bad node");
			goto corrupted;
		default:
			dbg_err("unknown");
			goto corrupted;
		}

		/* A valid node - record it in the scanning information */
		err = ubifs_add_snod(c, sleb, buf, offs);
		if (err)
			goto error;

		node_len = ALIGN(le32_to_cpu(ch->len), 8);
		offs += node_len;
		buf += node_len;
		len -= node_len;
	}

	/* The used part of the LEB must end at a min. I/O unit boundary */
	if (offs % c->min_io_size)
		goto corrupted;

	ubifs_end_scan(c, sleb, lnum, offs);

	/*
	 * Verify the remainder of the LEB contains only 0xFF (erased flash):
	 * first word-wise for speed, then byte-wise for the tail.
	 */
	for (; len > 4; offs += 4, buf = buf + 4, len -= 4)
		if (*(uint32_t *)buf != 0xffffffff)
			break;
	for (; len; offs++, buf++, len--)
		if (*(uint8_t *)buf != 0xff) {
			ubifs_err("corrupt empty space at LEB %d:%d",
				  lnum, offs);
			goto corrupted;
		}

	return sleb;

corrupted:
	ubifs_scanned_corruption(c, lnum, offs, buf);
	err = -EUCLEAN;
error:
	ubifs_err("LEB %d scanning failed", lnum);
	ubifs_scan_destroy(sleb);
	return ERR_PTR(err);
}
| 345 | |||
| 346 | /** | ||
| 347 | * ubifs_scan_destroy - destroy LEB scanning information. | ||
| 348 | * @sleb: scanning information to free | ||
| 349 | */ | ||
| 350 | void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) | ||
| 351 | { | ||
| 352 | struct ubifs_scan_node *node; | ||
| 353 | struct list_head *head; | ||
| 354 | |||
| 355 | head = &sleb->nodes; | ||
| 356 | while (!list_empty(head)) { | ||
| 357 | node = list_entry(head->next, struct ubifs_scan_node, list); | ||
| 358 | list_del(&node->list); | ||
| 359 | kfree(node); | ||
| 360 | } | ||
| 361 | kfree(sleb); | ||
| 362 | } | ||
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c new file mode 100644 index 000000000000..f248533841a2 --- /dev/null +++ b/fs/ubifs/shrinker.c | |||
| @@ -0,0 +1,322 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements UBIFS shrinker which evicts clean znodes from the TNC | ||
| 25 | * tree when Linux VM needs more RAM. | ||
| 26 | * | ||
| 27 | * We do not implement any LRU lists to find oldest znodes to free because it | ||
| 28 | * would add additional overhead to the file system fast paths. So the shrinker | ||
| 29 | * just walks the TNC tree when searching for znodes to free. | ||
| 30 | * | ||
| 31 | * If the root of a TNC sub-tree is clean and old enough, then the children are | ||
| 32 | * also clean and old enough. So the shrinker walks the TNC in level order and | ||
| 33 | * dumps entire sub-trees. | ||
| 34 | * | ||
| 35 | * The age of znodes is just the time-stamp when they were last looked at. | ||
| 36 | * The current shrinker first tries to evict old znodes, then young ones. | ||
| 37 | * | ||
| 38 | * Since the shrinker is global, it has to protect against races with FS | ||
| 39 | * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. | ||
| 40 | */ | ||
| 41 | |||
| 42 | #include "ubifs.h" | ||
| 43 | |||
| 44 | /* List of all UBIFS file-system instances */ | ||
| 45 | LIST_HEAD(ubifs_infos); | ||
| 46 | |||
| 47 | /* | ||
| 48 | * We number each shrinker run and record the number on the ubifs_info structure | ||
| 49 | * so that we can easily work out which ubifs_info structures have already been | ||
| 50 | * done by the current run. | ||
| 51 | */ | ||
| 52 | static unsigned int shrinker_run_no; | ||
| 53 | |||
| 54 | /* Protects 'ubifs_infos' list */ | ||
| 55 | DEFINE_SPINLOCK(ubifs_infos_lock); | ||
| 56 | |||
| 57 | /* Global clean znode counter (for all mounted UBIFS instances) */ | ||
| 58 | atomic_long_t ubifs_clean_zn_cnt; | ||
| 59 | |||
/**
 * shrink_tnc - shrink TNC tree.
 * @c: UBIFS file-system description object
 * @nr: number of znodes to free
 * @age: the age of znodes to free
 * @contention: if any contention, this is set to %1
 *
 * This function traverses TNC tree and frees clean znodes. It does not free
 * clean znodes which are younger than @age. Returns the number of freed
 * znodes. Must be called with both 'c->umount_mutex' and 'c->tnc_mutex'
 * held.
 */
static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
{
	int total_freed = 0;
	struct ubifs_znode *znode, *zprev;
	int time = get_seconds();

	ubifs_assert(mutex_is_locked(&c->umount_mutex));
	ubifs_assert(mutex_is_locked(&c->tnc_mutex));

	if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
		return 0;

	/*
	 * Traverse the TNC tree in levelorder manner, so that it is possible
	 * to destroy large sub-trees. Indeed, if a znode is old, then all its
	 * children are older or of the same age.
	 *
	 * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
	 * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
	 * changed only when the 'c->tnc_mutex' is held.
	 */
	zprev = NULL;
	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
	while (znode && total_freed < nr &&
	       atomic_long_read(&c->clean_zn_cnt) > 0) {
		int freed;

		/*
		 * If the znode is clean, but it is in the 'c->cnext' list, this
		 * means that this znode has just been written to flash as a
		 * part of commit and was marked clean. They will be removed
		 * from the list at end commit. We cannot change the list,
		 * because it is not protected by any mutex (design decision to
		 * make commit really independent and parallel to main I/O). So
		 * we just skip these znodes.
		 *
		 * Note, the 'clean_zn_cnt' counters are not updated until
		 * after the commit, so the UBIFS shrinker does not report
		 * the znodes which are in the 'c->cnext' list as freeable.
		 *
		 * Also note, if the root of a sub-tree is not in 'c->cnext',
		 * then the whole sub-tree is not in 'c->cnext' as well, so it
		 * is safe to dump whole sub-tree.
		 */

		if (znode->cnext) {
			/*
			 * Very soon these znodes will be removed from the list
			 * and become freeable.
			 */
			*contention = 1;
		} else if (!ubifs_zn_dirty(znode) &&
			   abs(time - znode->time) >= age) {
			/* Detach the sub-tree rooted at this znode ... */
			if (znode->parent)
				znode->parent->zbranch[znode->iip].znode = NULL;
			else
				c->zroot.znode = NULL;

			/* ... and free it, updating global and per-FS counts */
			freed = ubifs_destroy_tnc_subtree(znode);
			atomic_long_sub(freed, &ubifs_clean_zn_cnt);
			atomic_long_sub(freed, &c->clean_zn_cnt);
			ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
			total_freed += freed;
			/* Resume the levelorder walk from the previous znode */
			znode = zprev;
		}

		if (unlikely(!c->zroot.znode))
			break;

		zprev = znode;
		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
		cond_resched();
	}

	return total_freed;
}
| 146 | |||
/**
 * shrink_tnc_trees - shrink UBIFS TNC trees.
 * @nr: number of znodes to free
 * @age: the age of znodes to free
 * @contention: if any contention, this is set to %1
 *
 * This function walks the list of mounted UBIFS file-systems and frees clean
 * znodes which are older than @age, until at least @nr znodes are freed.
 * Returns the number of freed znodes.
 */
static int shrink_tnc_trees(int nr, int age, int *contention)
{
	struct ubifs_info *c;
	struct list_head *p;
	unsigned int run_no;
	int freed = 0;

	spin_lock(&ubifs_infos_lock);
	/* Pick a non-zero run number (0 marks "not visited") */
	do {
		run_no = ++shrinker_run_no;
	} while (run_no == 0);
	/* Iterate over all mounted UBIFS file-systems and try to shrink them */
	p = ubifs_infos.next;
	while (p != &ubifs_infos) {
		c = list_entry(p, struct ubifs_info, infos_list);
		/*
		 * We move the ones we do to the end of the list, so we stop
		 * when we see one we have already done.
		 */
		if (c->shrinker_run_no == run_no)
			break;
		if (!mutex_trylock(&c->umount_mutex)) {
			/* Some un-mount is in progress, try next FS */
			*contention = 1;
			p = p->next;
			continue;
		}
		/*
		 * We're holding 'c->umount_mutex', so the file-system won't go
		 * away.
		 */
		if (!mutex_trylock(&c->tnc_mutex)) {
			mutex_unlock(&c->umount_mutex);
			*contention = 1;
			p = p->next;
			continue;
		}
		spin_unlock(&ubifs_infos_lock);
		/*
		 * OK, now we have TNC locked, the file-system cannot go away -
		 * it is safe to reap the cache.
		 */
		c->shrinker_run_no = run_no;
		freed += shrink_tnc(c, nr, age, contention);
		mutex_unlock(&c->tnc_mutex);
		spin_lock(&ubifs_infos_lock);
		/* Get the next list element before we move this one */
		p = p->next;
		/*
		 * Move this one to the end of the list to provide some
		 * fairness.
		 */
		list_del(&c->infos_list);
		list_add_tail(&c->infos_list, &ubifs_infos);
		mutex_unlock(&c->umount_mutex);
		if (freed >= nr)
			break;
	}
	spin_unlock(&ubifs_infos_lock);
	return freed;
}
| 218 | |||
/**
 * kick_a_thread - kick a background thread to start commit.
 *
 * This function kicks a background thread to start background commit. Returns
 * %-1 if a thread was kicked or there is another reason to assume the memory
 * will soon be freed or become freeable. If there are no dirty znodes, returns
 * %0.
 */
static int kick_a_thread(void)
{
	int i;
	struct ubifs_info *c;

	/*
	 * Iterate over all mounted UBIFS file-systems and find out if there is
	 * already an ongoing commit operation there. If no, then iterate for
	 * the second time and initiate background commit.
	 */
	spin_lock(&ubifs_infos_lock);
	for (i = 0; i < 2; i++) {
		/* Pass 0 only probes; pass 1 actually requests a commit */
		list_for_each_entry(c, &ubifs_infos, infos_list) {
			long dirty_zn_cnt;

			if (!mutex_trylock(&c->umount_mutex)) {
				/*
				 * Some un-mount is in progress, it will
				 * certainly free memory, so just return.
				 */
				spin_unlock(&ubifs_infos_lock);
				return -1;
			}

			dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);

			/* Nothing to commit, or commit is impossible here */
			if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
			    c->ro_media) {
				mutex_unlock(&c->umount_mutex);
				continue;
			}

			/* A commit is already running or queued on this FS */
			if (c->cmt_state != COMMIT_RESTING) {
				spin_unlock(&ubifs_infos_lock);
				mutex_unlock(&c->umount_mutex);
				return -1;
			}

			if (i == 1) {
				/* Rotate for fairness, then kick the commit */
				list_del(&c->infos_list);
				list_add_tail(&c->infos_list, &ubifs_infos);
				spin_unlock(&ubifs_infos_lock);

				ubifs_request_bg_commit(c);
				mutex_unlock(&c->umount_mutex);
				return -1;
			}
			mutex_unlock(&c->umount_mutex);
		}
	}
	spin_unlock(&ubifs_infos_lock);

	return 0;
}
| 281 | |||
| 282 | int ubifs_shrinker(int nr, gfp_t gfp_mask) | ||
| 283 | { | ||
| 284 | int freed, contention = 0; | ||
| 285 | long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); | ||
| 286 | |||
| 287 | if (nr == 0) | ||
| 288 | return clean_zn_cnt; | ||
| 289 | |||
| 290 | if (!clean_zn_cnt) { | ||
| 291 | /* | ||
| 292 | * No clean znodes, nothing to reap. All we can do in this case | ||
| 293 | * is to kick background threads to start commit, which will | ||
| 294 | * probably make clean znodes which, in turn, will be freeable. | ||
| 295 | * And we return -1 which means will make VM call us again | ||
| 296 | * later. | ||
| 297 | */ | ||
| 298 | dbg_tnc("no clean znodes, kick a thread"); | ||
| 299 | return kick_a_thread(); | ||
| 300 | } | ||
| 301 | |||
| 302 | freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); | ||
| 303 | if (freed >= nr) | ||
| 304 | goto out; | ||
| 305 | |||
| 306 | dbg_tnc("not enough old znodes, try to free young ones"); | ||
| 307 | freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); | ||
| 308 | if (freed >= nr) | ||
| 309 | goto out; | ||
| 310 | |||
| 311 | dbg_tnc("not enough young znodes, free all"); | ||
| 312 | freed += shrink_tnc_trees(nr - freed, 0, &contention); | ||
| 313 | |||
| 314 | if (!freed && contention) { | ||
| 315 | dbg_tnc("freed nothing, but contention"); | ||
| 316 | return -1; | ||
| 317 | } | ||
| 318 | |||
| 319 | out: | ||
| 320 | dbg_tnc("%d znodes were freed, requested %d", freed, nr); | ||
| 321 | return freed; | ||
| 322 | } | ||
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c new file mode 100644 index 000000000000..00eb9c68ad03 --- /dev/null +++ b/fs/ubifs/super.c | |||
| @@ -0,0 +1,1951 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements UBIFS initialization and VFS superblock operations. Some | ||
| 25 | * initialization stuff which is rather large and complex is placed at | ||
| 26 | * corresponding subsystems, but most of it is here. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <linux/init.h> | ||
| 30 | #include <linux/slab.h> | ||
| 31 | #include <linux/module.h> | ||
| 32 | #include <linux/ctype.h> | ||
| 33 | #include <linux/random.h> | ||
| 34 | #include <linux/kthread.h> | ||
| 35 | #include <linux/parser.h> | ||
| 36 | #include <linux/seq_file.h> | ||
| 37 | #include <linux/mount.h> | ||
| 38 | #include "ubifs.h" | ||
| 39 | |||
| 40 | /* Slab cache for UBIFS inodes */ | ||
| 41 | struct kmem_cache *ubifs_inode_slab; | ||
| 42 | |||
| 43 | /* UBIFS TNC shrinker description */ | ||
| 44 | static struct shrinker ubifs_shrinker_info = { | ||
| 45 | .shrink = ubifs_shrinker, | ||
| 46 | .seeks = DEFAULT_SEEKS, | ||
| 47 | }; | ||
| 48 | |||
/**
 * validate_inode - validate inode.
 * @c: UBIFS file-system description object
 * @inode: the inode to validate
 *
 * This is a helper function for 'ubifs_iget()' which validates various fields
 * of a newly built inode to make sure they contain sane values and prevent
 * possible vulnerabilities. Returns zero if the inode is all right and
 * a non-zero error code if not. The positive codes 1-5 identify which check
 * failed; other return values come from 'dbg_check_dir_size()'.
 */
static int validate_inode(struct ubifs_info *c, const struct inode *inode)
{
	int err;
	const struct ubifs_inode *ui = ubifs_inode(inode);

	/* 1: size exceeds what this file-system instance supports */
	if (inode->i_size > c->max_inode_sz) {
		ubifs_err("inode is too large (%lld)",
			  (long long)inode->i_size);
		return 1;
	}

	/* 2: compression type is out of the known range */
	if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
		ubifs_err("unknown compression type %d", ui->compr_type);
		return 2;
	}

	/* 3: extended attribute accounting is implausibly large */
	if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX)
		return 3;

	/* 4: inline data length is out of range */
	if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA)
		return 4;

	/* 5: only regular-file inodes may carry the xattr flag */
	if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG)
		return 5;

	/* Known type but compressor not built in: warn, do not reject */
	if (!ubifs_compr_present(ui->compr_type)) {
		ubifs_warn("inode %lu uses '%s' compression, but it was not "
			   "compiled in", inode->i_ino,
			   ubifs_compr_name(ui->compr_type));
	}

	err = dbg_check_dir_size(c, inode);
	return err;
}
| 93 | |||
| 94 | struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) | ||
| 95 | { | ||
| 96 | int err; | ||
| 97 | union ubifs_key key; | ||
| 98 | struct ubifs_ino_node *ino; | ||
| 99 | struct ubifs_info *c = sb->s_fs_info; | ||
| 100 | struct inode *inode; | ||
| 101 | struct ubifs_inode *ui; | ||
| 102 | |||
| 103 | dbg_gen("inode %lu", inum); | ||
| 104 | |||
| 105 | inode = iget_locked(sb, inum); | ||
| 106 | if (!inode) | ||
| 107 | return ERR_PTR(-ENOMEM); | ||
| 108 | if (!(inode->i_state & I_NEW)) | ||
| 109 | return inode; | ||
| 110 | ui = ubifs_inode(inode); | ||
| 111 | |||
| 112 | ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); | ||
| 113 | if (!ino) { | ||
| 114 | err = -ENOMEM; | ||
| 115 | goto out; | ||
| 116 | } | ||
| 117 | |||
| 118 | ino_key_init(c, &key, inode->i_ino); | ||
| 119 | |||
| 120 | err = ubifs_tnc_lookup(c, &key, ino); | ||
| 121 | if (err) | ||
| 122 | goto out_ino; | ||
| 123 | |||
| 124 | inode->i_flags |= (S_NOCMTIME | S_NOATIME); | ||
| 125 | inode->i_nlink = le32_to_cpu(ino->nlink); | ||
| 126 | inode->i_uid = le32_to_cpu(ino->uid); | ||
| 127 | inode->i_gid = le32_to_cpu(ino->gid); | ||
| 128 | inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); | ||
| 129 | inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); | ||
| 130 | inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); | ||
| 131 | inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); | ||
| 132 | inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); | ||
| 133 | inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); | ||
| 134 | inode->i_mode = le32_to_cpu(ino->mode); | ||
| 135 | inode->i_size = le64_to_cpu(ino->size); | ||
| 136 | |||
| 137 | ui->data_len = le32_to_cpu(ino->data_len); | ||
| 138 | ui->flags = le32_to_cpu(ino->flags); | ||
| 139 | ui->compr_type = le16_to_cpu(ino->compr_type); | ||
| 140 | ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); | ||
| 141 | ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); | ||
| 142 | ui->xattr_size = le32_to_cpu(ino->xattr_size); | ||
| 143 | ui->xattr_names = le32_to_cpu(ino->xattr_names); | ||
| 144 | ui->synced_i_size = ui->ui_size = inode->i_size; | ||
| 145 | |||
| 146 | ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0; | ||
| 147 | |||
| 148 | err = validate_inode(c, inode); | ||
| 149 | if (err) | ||
| 150 | goto out_invalid; | ||
| 151 | |||
| 152 | /* Disable readahead */ | ||
| 153 | inode->i_mapping->backing_dev_info = &c->bdi; | ||
| 154 | |||
| 155 | switch (inode->i_mode & S_IFMT) { | ||
| 156 | case S_IFREG: | ||
| 157 | inode->i_mapping->a_ops = &ubifs_file_address_operations; | ||
| 158 | inode->i_op = &ubifs_file_inode_operations; | ||
| 159 | inode->i_fop = &ubifs_file_operations; | ||
| 160 | if (ui->xattr) { | ||
| 161 | ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); | ||
| 162 | if (!ui->data) { | ||
| 163 | err = -ENOMEM; | ||
| 164 | goto out_ino; | ||
| 165 | } | ||
| 166 | memcpy(ui->data, ino->data, ui->data_len); | ||
| 167 | ((char *)ui->data)[ui->data_len] = '\0'; | ||
| 168 | } else if (ui->data_len != 0) { | ||
| 169 | err = 10; | ||
| 170 | goto out_invalid; | ||
| 171 | } | ||
| 172 | break; | ||
| 173 | case S_IFDIR: | ||
| 174 | inode->i_op = &ubifs_dir_inode_operations; | ||
| 175 | inode->i_fop = &ubifs_dir_operations; | ||
| 176 | if (ui->data_len != 0) { | ||
| 177 | err = 11; | ||
| 178 | goto out_invalid; | ||
| 179 | } | ||
| 180 | break; | ||
| 181 | case S_IFLNK: | ||
| 182 | inode->i_op = &ubifs_symlink_inode_operations; | ||
| 183 | if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { | ||
| 184 | err = 12; | ||
| 185 | goto out_invalid; | ||
| 186 | } | ||
| 187 | ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); | ||
| 188 | if (!ui->data) { | ||
| 189 | err = -ENOMEM; | ||
| 190 | goto out_ino; | ||
| 191 | } | ||
| 192 | memcpy(ui->data, ino->data, ui->data_len); | ||
| 193 | ((char *)ui->data)[ui->data_len] = '\0'; | ||
| 194 | break; | ||
| 195 | case S_IFBLK: | ||
| 196 | case S_IFCHR: | ||
| 197 | { | ||
| 198 | dev_t rdev; | ||
| 199 | union ubifs_dev_desc *dev; | ||
| 200 | |||
| 201 | ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); | ||
| 202 | if (!ui->data) { | ||
| 203 | err = -ENOMEM; | ||
| 204 | goto out_ino; | ||
| 205 | } | ||
| 206 | |||
| 207 | dev = (union ubifs_dev_desc *)ino->data; | ||
| 208 | if (ui->data_len == sizeof(dev->new)) | ||
| 209 | rdev = new_decode_dev(le32_to_cpu(dev->new)); | ||
| 210 | else if (ui->data_len == sizeof(dev->huge)) | ||
| 211 | rdev = huge_decode_dev(le64_to_cpu(dev->huge)); | ||
| 212 | else { | ||
| 213 | err = 13; | ||
| 214 | goto out_invalid; | ||
| 215 | } | ||
| 216 | memcpy(ui->data, ino->data, ui->data_len); | ||
| 217 | inode->i_op = &ubifs_file_inode_operations; | ||
| 218 | init_special_inode(inode, inode->i_mode, rdev); | ||
| 219 | break; | ||
| 220 | } | ||
| 221 | case S_IFSOCK: | ||
| 222 | case S_IFIFO: | ||
| 223 | inode->i_op = &ubifs_file_inode_operations; | ||
| 224 | init_special_inode(inode, inode->i_mode, 0); | ||
| 225 | if (ui->data_len != 0) { | ||
| 226 | err = 14; | ||
| 227 | goto out_invalid; | ||
| 228 | } | ||
| 229 | break; | ||
| 230 | default: | ||
| 231 | err = 15; | ||
| 232 | goto out_invalid; | ||
| 233 | } | ||
| 234 | |||
| 235 | kfree(ino); | ||
| 236 | ubifs_set_inode_flags(inode); | ||
| 237 | unlock_new_inode(inode); | ||
| 238 | return inode; | ||
| 239 | |||
| 240 | out_invalid: | ||
| 241 | ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); | ||
| 242 | dbg_dump_node(c, ino); | ||
| 243 | dbg_dump_inode(c, inode); | ||
| 244 | err = -EINVAL; | ||
| 245 | out_ino: | ||
| 246 | kfree(ino); | ||
| 247 | out: | ||
| 248 | ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); | ||
| 249 | iget_failed(inode); | ||
| 250 | return ERR_PTR(err); | ||
| 251 | } | ||
| 252 | |||
| 253 | static struct inode *ubifs_alloc_inode(struct super_block *sb) | ||
| 254 | { | ||
| 255 | struct ubifs_inode *ui; | ||
| 256 | |||
| 257 | ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); | ||
| 258 | if (!ui) | ||
| 259 | return NULL; | ||
| 260 | |||
| 261 | memset((void *)ui + sizeof(struct inode), 0, | ||
| 262 | sizeof(struct ubifs_inode) - sizeof(struct inode)); | ||
| 263 | mutex_init(&ui->ui_mutex); | ||
| 264 | spin_lock_init(&ui->ui_lock); | ||
| 265 | return &ui->vfs_inode; | ||
| 266 | }; | ||
| 267 | |||
| 268 | static void ubifs_destroy_inode(struct inode *inode) | ||
| 269 | { | ||
| 270 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 271 | |||
| 272 | kfree(ui->data); | ||
| 273 | kmem_cache_free(ubifs_inode_slab, inode); | ||
| 274 | } | ||
| 275 | |||
/*
 * Write a dirty inode to the journal.
 *
 * Note, Linux write-back code calls this without 'i_mutex'.
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_write_inode(struct inode *inode, int wait)
{
	int err;
	struct ubifs_info *c = inode->i_sb->s_fs_info;
	struct ubifs_inode *ui = ubifs_inode(inode);

	/* Extended attribute inodes must never reach this path */
	ubifs_assert(!ui->xattr);
	if (is_bad_inode(inode))
		return 0;

	mutex_lock(&ui->ui_mutex);
	/*
	 * Due to races between write-back forced by budgeting
	 * (see 'sync_some_inodes()') and pdflush write-back, the inode may
	 * have already been synchronized, do not do this again. This might
	 * also happen if it was synchronized in an VFS operation, e.g.
	 * 'ubifs_link()'.
	 */
	if (!ui->dirty) {
		mutex_unlock(&ui->ui_mutex);
		return 0;
	}

	dbg_gen("inode %lu", inode->i_ino);
	err = ubifs_jnl_write_inode(c, inode, 0);
	if (err)
		ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);

	/*
	 * Note, the dirty flag is cleared and the inode budget released
	 * even if the journal write failed.
	 */
	ui->dirty = 0;
	mutex_unlock(&ui->ui_mutex);
	ubifs_release_dirty_inode_budget(c, ui);
	return err;
}
| 312 | |||
/*
 * Delete an inode whose last link and last reference are gone (the asserts
 * below enforce i_count == 0 and i_nlink == 0): write the deletion inode to
 * the journal, then clear the VFS inode.
 */
static void ubifs_delete_inode(struct inode *inode)
{
	int err;
	struct ubifs_info *c = inode->i_sb->s_fs_info;

	if (ubifs_inode(inode)->xattr)
		/*
		 * Extended attribute inode deletions are fully handled in
		 * 'ubifs_removexattr()'. These inodes are special and have
		 * limited usage, so there is nothing to do here.
		 */
		goto out;

	dbg_gen("inode %lu", inode->i_ino);
	ubifs_assert(!atomic_read(&inode->i_count));
	ubifs_assert(inode->i_nlink == 0);

	/* Drop all cached pages before touching the media */
	truncate_inode_pages(&inode->i_data, 0);
	if (is_bad_inode(inode))
		goto out;

	/* Size zero plus the 'deletion' flag tells the journal to delete */
	ubifs_inode(inode)->ui_size = inode->i_size = 0;
	err = ubifs_jnl_write_inode(c, inode, 1);
	if (err)
		/*
		 * Worst case we have a lost orphan inode wasting space, so a
		 * simple error message is ok here.
		 */
		ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
out:
	clear_inode(inode);
}
| 345 | |||
| 346 | static void ubifs_dirty_inode(struct inode *inode) | ||
| 347 | { | ||
| 348 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 349 | |||
| 350 | ubifs_assert(mutex_is_locked(&ui->ui_mutex)); | ||
| 351 | if (!ui->dirty) { | ||
| 352 | ui->dirty = 1; | ||
| 353 | dbg_gen("inode %lu", inode->i_ino); | ||
| 354 | } | ||
| 355 | } | ||
| 356 | |||
| 357 | static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
| 358 | { | ||
| 359 | struct ubifs_info *c = dentry->d_sb->s_fs_info; | ||
| 360 | unsigned long long free; | ||
| 361 | |||
| 362 | free = ubifs_budg_get_free_space(c); | ||
| 363 | dbg_gen("free space %lld bytes (%lld blocks)", | ||
| 364 | free, free >> UBIFS_BLOCK_SHIFT); | ||
| 365 | |||
| 366 | buf->f_type = UBIFS_SUPER_MAGIC; | ||
| 367 | buf->f_bsize = UBIFS_BLOCK_SIZE; | ||
| 368 | buf->f_blocks = c->block_cnt; | ||
| 369 | buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; | ||
| 370 | if (free > c->report_rp_size) | ||
| 371 | buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; | ||
| 372 | else | ||
| 373 | buf->f_bavail = 0; | ||
| 374 | buf->f_files = 0; | ||
| 375 | buf->f_ffree = 0; | ||
| 376 | buf->f_namelen = UBIFS_MAX_NLEN; | ||
| 377 | |||
| 378 | return 0; | ||
| 379 | } | ||
| 380 | |||
| 381 | static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) | ||
| 382 | { | ||
| 383 | struct ubifs_info *c = mnt->mnt_sb->s_fs_info; | ||
| 384 | |||
| 385 | if (c->mount_opts.unmount_mode == 2) | ||
| 386 | seq_printf(s, ",fast_unmount"); | ||
| 387 | else if (c->mount_opts.unmount_mode == 1) | ||
| 388 | seq_printf(s, ",norm_unmount"); | ||
| 389 | |||
| 390 | return 0; | ||
| 391 | } | ||
| 392 | |||
| 393 | static int ubifs_sync_fs(struct super_block *sb, int wait) | ||
| 394 | { | ||
| 395 | struct ubifs_info *c = sb->s_fs_info; | ||
| 396 | int i, ret = 0, err; | ||
| 397 | |||
| 398 | if (c->jheads) | ||
| 399 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 400 | err = ubifs_wbuf_sync(&c->jheads[i].wbuf); | ||
| 401 | if (err && !ret) | ||
| 402 | ret = err; | ||
| 403 | } | ||
| 404 | /* | ||
| 405 | * We ought to call sync for c->ubi but it does not have one. If it had | ||
| 406 | * it would in turn call mtd->sync, however mtd operations are | ||
| 407 | * synchronous anyway, so we don't lose any sleep here. | ||
| 408 | */ | ||
| 409 | return ret; | ||
| 410 | } | ||
| 411 | |||
/**
 * init_constants_early - initialize UBIFS constants.
 * @c: UBIFS file-system description object
 *
 * This function initialize UBIFS constants which do not need the superblock to
 * be read. It also checks that the UBI volume satisfies basic UBIFS
 * requirements. Returns zero in case of success and a negative error code in
 * case of failure.
 */
static int init_constants_early(struct ubifs_info *c)
{
	/* Any of the following conditions forces read-only operation */
	if (c->vi.corrupted) {
		ubifs_warn("UBI volume is corrupted - read-only mode");
		c->ro_media = 1;
	}

	if (c->di.ro_mode) {
		ubifs_msg("read-only UBI device");
		c->ro_media = 1;
	}

	if (c->vi.vol_type == UBI_STATIC_VOLUME) {
		ubifs_msg("static UBI volume - read-only mode");
		c->ro_media = 1;
	}

	c->leb_cnt = c->vi.size;
	c->leb_size = c->vi.usable_leb_size;
	c->half_leb_size = c->leb_size / 2;
	c->min_io_size = c->di.min_io_size;
	/* fls(x) - 1 == log2(x) for powers of 2 (validated below) */
	c->min_io_shift = fls(c->min_io_size) - 1;

	if (c->leb_size < UBIFS_MIN_LEB_SZ) {
		ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
			  c->leb_size, UBIFS_MIN_LEB_SZ);
		return -EINVAL;
	}

	if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
		ubifs_err("too few LEBs (%d), min. is %d",
			  c->leb_cnt, UBIFS_MIN_LEB_CNT);
		return -EINVAL;
	}

	if (!is_power_of_2(c->min_io_size)) {
		ubifs_err("bad min. I/O size %d", c->min_io_size);
		return -EINVAL;
	}

	/*
	 * UBIFS aligns all node to 8-byte boundary, so to make function in
	 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
	 * less than 8.
	 */
	if (c->min_io_size < 8) {
		c->min_io_size = 8;
		c->min_io_shift = 3;
	}

	c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
	c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);

	/*
	 * Initialize node length ranges which are mostly needed for node
	 * length validation. Fixed-size node types get an exact '.len';
	 * variable-size types get '.min_len'/'.max_len' bounds.
	 */
	c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ;
	c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
	c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
	c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
	c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
	c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;

	c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
	c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
	/* An orphan node carries at least one 64-bit inode number */
	c->ranges[UBIFS_ORPH_NODE].min_len =
				UBIFS_ORPH_NODE_SZ + sizeof(__le64);
	c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
	c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
	c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
	c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
	c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
	c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
	c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
	/*
	 * Minimum indexing node size is amended later when superblock is
	 * read and the key length is known.
	 */
	c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
	/*
	 * Maximum indexing node size is amended later when superblock is
	 * read and the fanout is known.
	 */
	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;

	/*
	 * Initialize dead and dark LEB space watermarks.
	 *
	 * Dead space is the space which cannot be used. Its watermark is
	 * equivalent to min. I/O unit or minimum node size if it is greater
	 * then min. I/O unit.
	 *
	 * Dark space is the space which might be used, or might not, depending
	 * on which node should be written to the LEB. Its watermark is
	 * equivalent to maximum UBIFS node size.
	 */
	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);

	return 0;
}
| 523 | |||
/**
 * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
 * @c: UBIFS file-system description object
 * @lnum: LEB the write-buffer was synchronized to
 * @free: how many free bytes left in this LEB
 * @pad: how many bytes were padded
 *
 * The I/O unit invokes this call-back whenever a write-buffer has been
 * synchronized, so that the space accounting in bud logical eraseblocks
 * stays correct. Returns zero in case of success and a negative error code
 * in case of failure.
 *
 * Logically this function belongs to the journal, but it lives here so
 * that it can remain static.
 */
static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
{
	int err;

	err = ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
	return err;
}
| 543 | |||
/**
 * init_constants_late - initialize UBIFS constants.
 * @c: UBIFS file-system description object
 *
 * This is a helper function which initializes various UBIFS constants after
 * the superblock has been read. It also checks various UBIFS parameters and
 * makes sure they are all right. Returns zero in case of success and a
 * negative error code in case of failure.
 */
static int init_constants_late(struct ubifs_info *c)
{
	int tmp, err;
	uint64_t tmp64;

	c->main_bytes = (long long)c->main_lebs * c->leb_size;
	c->max_znode_sz = sizeof(struct ubifs_znode) +
				c->fanout * sizeof(struct ubifs_zbranch);

	/* Smallest index node: one branch */
	tmp = ubifs_idx_node_sz(c, 1);
	c->ranges[UBIFS_IDX_NODE].min_len = tmp;
	c->min_idx_node_sz = ALIGN(tmp, 8);

	/* Largest index node: full fanout */
	tmp = ubifs_idx_node_sz(c, c->fanout);
	c->ranges[UBIFS_IDX_NODE].max_len = tmp;
	c->max_idx_node_sz = ALIGN(tmp, 8);

	/* Make sure LEB size is large enough to fit full commit */
	tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
	tmp = ALIGN(tmp, c->min_io_size);
	if (tmp > c->leb_size) {
		dbg_err("too small LEB size %d, at least %d needed",
			c->leb_size, tmp);
		return -EINVAL;
	}

	/*
	 * Make sure that the log is large enough to fit reference nodes for
	 * all buds plus one reserved LEB.
	 */
	/* max_bud_cnt = ceil(max_bud_bytes / leb_size) */
	tmp64 = c->max_bud_bytes;
	tmp = do_div(tmp64, c->leb_size);
	c->max_bud_cnt = tmp64 + !!tmp;
	/* LEBs needed for the reference nodes, rounded up, plus one spare */
	tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
	tmp /= c->leb_size;
	tmp += 1;
	if (c->log_lebs < tmp) {
		dbg_err("too small log %d LEBs, required min. %d LEBs",
			c->log_lebs, tmp);
		return -EINVAL;
	}

	/*
	 * When budgeting we assume worst-case scenarios when the pages are not
	 * be compressed and direntries are of the maximum size.
	 *
	 * Note, data, which may be stored in inodes is budgeted separately, so
	 * it is not included into 'c->inode_budget'.
	 */
	c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
	c->inode_budget = UBIFS_INO_NODE_SZ;
	c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;

	/*
	 * When the amount of flash space used by buds becomes
	 * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
	 * The writers are unblocked when the commit is finished. To avoid
	 * writers to be blocked UBIFS initiates background commit in advance,
	 * when number of bud bytes becomes above the limit defined below
	 * (13/16, i.e. ~81% of 'c->max_bud_bytes').
	 */
	c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4;

	/*
	 * Ensure minimum journal size. All the bytes in the journal heads are
	 * considered to be used, when calculating the current journal usage.
	 * Consequently, if the journal is too small, UBIFS will treat it as
	 * always full.
	 */
	tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
	if (c->bg_bud_bytes < tmp64)
		c->bg_bud_bytes = tmp64;
	if (c->max_bud_bytes < tmp64 + c->leb_size)
		c->max_bud_bytes = tmp64 + c->leb_size;

	err = ubifs_calc_lpt_geom(c);
	if (err)
		return err;

	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);

	/*
	 * Calculate total amount of FS blocks. This number is not used
	 * internally because it does not make much sense for UBIFS, but it is
	 * necessary to report something for the 'statfs()' call.
	 *
	 * Subtract the LEB reserved for GC and the LEB which is reserved for
	 * deletions.
	 *
	 * Review 'ubifs_calc_available()' if changing this calculation.
	 */
	tmp64 = c->main_lebs - 2;
	tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
	tmp64 = ubifs_reported_space(c, tmp64);
	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;

	return 0;
}
| 650 | |||
| 651 | /** | ||
| 652 | * take_gc_lnum - reserve GC LEB. | ||
| 653 | * @c: UBIFS file-system description object | ||
| 654 | * | ||
| 655 | * This function ensures that the LEB reserved for garbage collection is | ||
| 656 | * unmapped and is marked as "taken" in lprops. We also have to set free space | ||
| 657 | * to LEB size and dirty space to zero, because lprops may contain out-of-date | ||
| 658 | * information if the file-system was un-mounted before it has been committed. | ||
| 659 | * This function returns zero in case of success and a negative error code in | ||
| 660 | * case of failure. | ||
| 661 | */ | ||
| 662 | static int take_gc_lnum(struct ubifs_info *c) | ||
| 663 | { | ||
| 664 | int err; | ||
| 665 | |||
| 666 | if (c->gc_lnum == -1) { | ||
| 667 | ubifs_err("no LEB for GC"); | ||
| 668 | return -EINVAL; | ||
| 669 | } | ||
| 670 | |||
| 671 | err = ubifs_leb_unmap(c, c->gc_lnum); | ||
| 672 | if (err) | ||
| 673 | return err; | ||
| 674 | |||
| 675 | /* And we have to tell lprops that this LEB is taken */ | ||
| 676 | err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, | ||
| 677 | LPROPS_TAKEN, 0, 0); | ||
| 678 | return err; | ||
| 679 | } | ||
| 680 | |||
| 681 | /** | ||
| 682 | * alloc_wbufs - allocate write-buffers. | ||
| 683 | * @c: UBIFS file-system description object | ||
| 684 | * | ||
| 685 | * This helper function allocates and initializes UBIFS write-buffers. Returns | ||
| 686 | * zero in case of success and %-ENOMEM in case of failure. | ||
| 687 | */ | ||
| 688 | static int alloc_wbufs(struct ubifs_info *c) | ||
| 689 | { | ||
| 690 | int i, err; | ||
| 691 | |||
| 692 | c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), | ||
| 693 | GFP_KERNEL); | ||
| 694 | if (!c->jheads) | ||
| 695 | return -ENOMEM; | ||
| 696 | |||
| 697 | /* Initialize journal heads */ | ||
| 698 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 699 | INIT_LIST_HEAD(&c->jheads[i].buds_list); | ||
| 700 | err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); | ||
| 701 | if (err) | ||
| 702 | return err; | ||
| 703 | |||
| 704 | c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; | ||
| 705 | c->jheads[i].wbuf.jhead = i; | ||
| 706 | } | ||
| 707 | |||
| 708 | c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; | ||
| 709 | /* | ||
| 710 | * Garbage Collector head likely contains long-term data and | ||
| 711 | * does not need to be synchronized by timer. | ||
| 712 | */ | ||
| 713 | c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; | ||
| 714 | c->jheads[GCHD].wbuf.timeout = 0; | ||
| 715 | |||
| 716 | return 0; | ||
| 717 | } | ||
| 718 | |||
| 719 | /** | ||
| 720 | * free_wbufs - free write-buffers. | ||
| 721 | * @c: UBIFS file-system description object | ||
| 722 | */ | ||
| 723 | static void free_wbufs(struct ubifs_info *c) | ||
| 724 | { | ||
| 725 | int i; | ||
| 726 | |||
| 727 | if (c->jheads) { | ||
| 728 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 729 | kfree(c->jheads[i].wbuf.buf); | ||
| 730 | kfree(c->jheads[i].wbuf.inodes); | ||
| 731 | } | ||
| 732 | kfree(c->jheads); | ||
| 733 | c->jheads = NULL; | ||
| 734 | } | ||
| 735 | } | ||
| 736 | |||
| 737 | /** | ||
| 738 | * free_orphans - free orphans. | ||
| 739 | * @c: UBIFS file-system description object | ||
| 740 | */ | ||
| 741 | static void free_orphans(struct ubifs_info *c) | ||
| 742 | { | ||
| 743 | struct ubifs_orphan *orph; | ||
| 744 | |||
| 745 | while (c->orph_dnext) { | ||
| 746 | orph = c->orph_dnext; | ||
| 747 | c->orph_dnext = orph->dnext; | ||
| 748 | list_del(&orph->list); | ||
| 749 | kfree(orph); | ||
| 750 | } | ||
| 751 | |||
| 752 | while (!list_empty(&c->orph_list)) { | ||
| 753 | orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); | ||
| 754 | list_del(&orph->list); | ||
| 755 | kfree(orph); | ||
| 756 | dbg_err("orphan list not empty at unmount"); | ||
| 757 | } | ||
| 758 | |||
| 759 | vfree(c->orph_buf); | ||
| 760 | c->orph_buf = NULL; | ||
| 761 | } | ||
| 762 | |||
/**
 * free_buds - free per-bud objects.
 * @c: UBIFS file-system description object
 *
 * Frees every 'struct ubifs_bud' in the @c->buds red-black tree using an
 * iterative post-order walk: descend to a childless node, free it, and
 * detach it from its parent so the walk does not revisit freed memory.
 */
static void free_buds(struct ubifs_info *c)
{
	struct rb_node *this = c->buds.rb_node;
	struct ubifs_bud *bud;

	while (this) {
		/* Descend until a node with no children is reached */
		if (this->rb_left)
			this = this->rb_left;
		else if (this->rb_right)
			this = this->rb_right;
		else {
			bud = rb_entry(this, struct ubifs_bud, rb);
			/* Step back to the parent before freeing the leaf */
			this = rb_parent(this);
			if (this) {
				/* Unlink the freed child from its parent */
				if (this->rb_left == &bud->rb)
					this->rb_left = NULL;
				else
					this->rb_right = NULL;
			}
			kfree(bud);
		}
	}
}
| 790 | |||
| 791 | /** | ||
| 792 | * check_volume_empty - check if the UBI volume is empty. | ||
| 793 | * @c: UBIFS file-system description object | ||
| 794 | * | ||
| 795 | * This function checks if the UBIFS volume is empty by looking if its LEBs are | ||
| 796 | * mapped or not. The result of checking is stored in the @c->empty variable. | ||
| 797 | * Returns zero in case of success and a negative error code in case of | ||
| 798 | * failure. | ||
| 799 | */ | ||
| 800 | static int check_volume_empty(struct ubifs_info *c) | ||
| 801 | { | ||
| 802 | int lnum, err; | ||
| 803 | |||
| 804 | c->empty = 1; | ||
| 805 | for (lnum = 0; lnum < c->leb_cnt; lnum++) { | ||
| 806 | err = ubi_is_mapped(c->ubi, lnum); | ||
| 807 | if (unlikely(err < 0)) | ||
| 808 | return err; | ||
| 809 | if (err == 1) { | ||
| 810 | c->empty = 0; | ||
| 811 | break; | ||
| 812 | } | ||
| 813 | |||
| 814 | cond_resched(); | ||
| 815 | } | ||
| 816 | |||
| 817 | return 0; | ||
| 818 | } | ||
| 819 | |||
/*
 * UBIFS mount options.
 *
 * Opt_fast_unmount: do not run a journal commit before un-mounting
 * Opt_norm_unmount: run a journal commit before un-mounting
 * Opt_err: just end of array marker
 */
enum {
	Opt_fast_unmount,
	Opt_norm_unmount,
	Opt_err,
};

/* Maps mount option strings to the tokens above, for 'match_token()' */
static match_table_t tokens = {
	{Opt_fast_unmount, "fast_unmount"},
	{Opt_norm_unmount, "norm_unmount"},
	{Opt_err, NULL},
};
| 838 | |||
| 839 | /** | ||
| 840 | * ubifs_parse_options - parse mount parameters. | ||
| 841 | * @c: UBIFS file-system description object | ||
| 842 | * @options: parameters to parse | ||
| 843 | * @is_remount: non-zero if this is FS re-mount | ||
| 844 | * | ||
| 845 | * This function parses UBIFS mount options and returns zero in case success | ||
| 846 | * and a negative error code in case of failure. | ||
| 847 | */ | ||
| 848 | static int ubifs_parse_options(struct ubifs_info *c, char *options, | ||
| 849 | int is_remount) | ||
| 850 | { | ||
| 851 | char *p; | ||
| 852 | substring_t args[MAX_OPT_ARGS]; | ||
| 853 | |||
| 854 | if (!options) | ||
| 855 | return 0; | ||
| 856 | |||
| 857 | while ((p = strsep(&options, ","))) { | ||
| 858 | int token; | ||
| 859 | |||
| 860 | if (!*p) | ||
| 861 | continue; | ||
| 862 | |||
| 863 | token = match_token(p, tokens, args); | ||
| 864 | switch (token) { | ||
| 865 | case Opt_fast_unmount: | ||
| 866 | c->mount_opts.unmount_mode = 2; | ||
| 867 | c->fast_unmount = 1; | ||
| 868 | break; | ||
| 869 | case Opt_norm_unmount: | ||
| 870 | c->mount_opts.unmount_mode = 1; | ||
| 871 | c->fast_unmount = 0; | ||
| 872 | break; | ||
| 873 | default: | ||
| 874 | ubifs_err("unrecognized mount option \"%s\" " | ||
| 875 | "or missing value", p); | ||
| 876 | return -EINVAL; | ||
| 877 | } | ||
| 878 | } | ||
| 879 | |||
| 880 | return 0; | ||
| 881 | } | ||
| 882 | |||
| 883 | /** | ||
| 884 | * destroy_journal - destroy journal data structures. | ||
| 885 | * @c: UBIFS file-system description object | ||
| 886 | * | ||
| 887 | * This function destroys journal data structures including those that may have | ||
| 888 | * been created by recovery functions. | ||
| 889 | */ | ||
| 890 | static void destroy_journal(struct ubifs_info *c) | ||
| 891 | { | ||
| 892 | while (!list_empty(&c->unclean_leb_list)) { | ||
| 893 | struct ubifs_unclean_leb *ucleb; | ||
| 894 | |||
| 895 | ucleb = list_entry(c->unclean_leb_list.next, | ||
| 896 | struct ubifs_unclean_leb, list); | ||
| 897 | list_del(&ucleb->list); | ||
| 898 | kfree(ucleb); | ||
| 899 | } | ||
| 900 | while (!list_empty(&c->old_buds)) { | ||
| 901 | struct ubifs_bud *bud; | ||
| 902 | |||
| 903 | bud = list_entry(c->old_buds.next, struct ubifs_bud, list); | ||
| 904 | list_del(&bud->list); | ||
| 905 | kfree(bud); | ||
| 906 | } | ||
| 907 | ubifs_destroy_idx_gc(c); | ||
| 908 | ubifs_destroy_size_tree(c); | ||
| 909 | ubifs_tnc_close(c); | ||
| 910 | free_buds(c); | ||
| 911 | } | ||
| 912 | |||
| 913 | /** | ||
| 914 | * mount_ubifs - mount UBIFS file-system. | ||
| 915 | * @c: UBIFS file-system description object | ||
| 916 | * | ||
| 917 | * This function mounts UBIFS file system. Returns zero in case of success and | ||
| 918 | * a negative error code in case of failure. | ||
| 919 | * | ||
| 920 | * Note, the function does not de-allocate resources it it fails half way | ||
| 921 | * through, and the caller has to do this instead. | ||
| 922 | */ | ||
| 923 | static int mount_ubifs(struct ubifs_info *c) | ||
| 924 | { | ||
| 925 | struct super_block *sb = c->vfs_sb; | ||
| 926 | int err, mounted_read_only = (sb->s_flags & MS_RDONLY); | ||
| 927 | long long x; | ||
| 928 | size_t sz; | ||
| 929 | |||
| 930 | err = init_constants_early(c); | ||
| 931 | if (err) | ||
| 932 | return err; | ||
| 933 | |||
| 934 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 935 | c->dbg_buf = vmalloc(c->leb_size); | ||
| 936 | if (!c->dbg_buf) | ||
| 937 | return -ENOMEM; | ||
| 938 | #endif | ||
| 939 | |||
| 940 | err = check_volume_empty(c); | ||
| 941 | if (err) | ||
| 942 | goto out_free; | ||
| 943 | |||
| 944 | if (c->empty && (mounted_read_only || c->ro_media)) { | ||
| 945 | /* | ||
| 946 | * This UBI volume is empty, and read-only, or the file system | ||
| 947 | * is mounted read-only - we cannot format it. | ||
| 948 | */ | ||
| 949 | ubifs_err("can't format empty UBI volume: read-only %s", | ||
| 950 | c->ro_media ? "UBI volume" : "mount"); | ||
| 951 | err = -EROFS; | ||
| 952 | goto out_free; | ||
| 953 | } | ||
| 954 | |||
| 955 | if (c->ro_media && !mounted_read_only) { | ||
| 956 | ubifs_err("cannot mount read-write - read-only media"); | ||
| 957 | err = -EROFS; | ||
| 958 | goto out_free; | ||
| 959 | } | ||
| 960 | |||
| 961 | /* | ||
| 962 | * The requirement for the buffer is that it should fit indexing B-tree | ||
| 963 | * height amount of integers. We assume the height if the TNC tree will | ||
| 964 | * never exceed 64. | ||
| 965 | */ | ||
| 966 | err = -ENOMEM; | ||
| 967 | c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); | ||
| 968 | if (!c->bottom_up_buf) | ||
| 969 | goto out_free; | ||
| 970 | |||
| 971 | c->sbuf = vmalloc(c->leb_size); | ||
| 972 | if (!c->sbuf) | ||
| 973 | goto out_free; | ||
| 974 | |||
| 975 | if (!mounted_read_only) { | ||
| 976 | c->ileb_buf = vmalloc(c->leb_size); | ||
| 977 | if (!c->ileb_buf) | ||
| 978 | goto out_free; | ||
| 979 | } | ||
| 980 | |||
| 981 | err = ubifs_read_superblock(c); | ||
| 982 | if (err) | ||
| 983 | goto out_free; | ||
| 984 | |||
| 985 | /* | ||
| 986 | * Make sure the compressor which is set as the default on in the | ||
| 987 | * superblock was actually compiled in. | ||
| 988 | */ | ||
| 989 | if (!ubifs_compr_present(c->default_compr)) { | ||
| 990 | ubifs_warn("'%s' compressor is set by superblock, but not " | ||
| 991 | "compiled in", ubifs_compr_name(c->default_compr)); | ||
| 992 | c->default_compr = UBIFS_COMPR_NONE; | ||
| 993 | } | ||
| 994 | |||
| 995 | dbg_failure_mode_registration(c); | ||
| 996 | |||
| 997 | err = init_constants_late(c); | ||
| 998 | if (err) | ||
| 999 | goto out_dereg; | ||
| 1000 | |||
| 1001 | sz = ALIGN(c->max_idx_node_sz, c->min_io_size); | ||
| 1002 | sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); | ||
| 1003 | c->cbuf = kmalloc(sz, GFP_NOFS); | ||
| 1004 | if (!c->cbuf) { | ||
| 1005 | err = -ENOMEM; | ||
| 1006 | goto out_dereg; | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | if (!mounted_read_only) { | ||
| 1010 | err = alloc_wbufs(c); | ||
| 1011 | if (err) | ||
| 1012 | goto out_cbuf; | ||
| 1013 | |||
| 1014 | /* Create background thread */ | ||
| 1015 | sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, | ||
| 1016 | c->vi.vol_id); | ||
| 1017 | c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); | ||
| 1018 | if (!c->bgt) | ||
| 1019 | c->bgt = ERR_PTR(-EINVAL); | ||
| 1020 | if (IS_ERR(c->bgt)) { | ||
| 1021 | err = PTR_ERR(c->bgt); | ||
| 1022 | c->bgt = NULL; | ||
| 1023 | ubifs_err("cannot spawn \"%s\", error %d", | ||
| 1024 | c->bgt_name, err); | ||
| 1025 | goto out_wbufs; | ||
| 1026 | } | ||
| 1027 | wake_up_process(c->bgt); | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | err = ubifs_read_master(c); | ||
| 1031 | if (err) | ||
| 1032 | goto out_master; | ||
| 1033 | |||
| 1034 | if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { | ||
| 1035 | ubifs_msg("recovery needed"); | ||
| 1036 | c->need_recovery = 1; | ||
| 1037 | if (!mounted_read_only) { | ||
| 1038 | err = ubifs_recover_inl_heads(c, c->sbuf); | ||
| 1039 | if (err) | ||
| 1040 | goto out_master; | ||
| 1041 | } | ||
| 1042 | } else if (!mounted_read_only) { | ||
| 1043 | /* | ||
| 1044 | * Set the "dirty" flag so that if we reboot uncleanly we | ||
| 1045 | * will notice this immediately on the next mount. | ||
| 1046 | */ | ||
| 1047 | c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); | ||
| 1048 | err = ubifs_write_master(c); | ||
| 1049 | if (err) | ||
| 1050 | goto out_master; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | err = ubifs_lpt_init(c, 1, !mounted_read_only); | ||
| 1054 | if (err) | ||
| 1055 | goto out_lpt; | ||
| 1056 | |||
| 1057 | err = dbg_check_idx_size(c, c->old_idx_sz); | ||
| 1058 | if (err) | ||
| 1059 | goto out_lpt; | ||
| 1060 | |||
| 1061 | err = ubifs_replay_journal(c); | ||
| 1062 | if (err) | ||
| 1063 | goto out_journal; | ||
| 1064 | |||
| 1065 | err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); | ||
| 1066 | if (err) | ||
| 1067 | goto out_orphans; | ||
| 1068 | |||
| 1069 | if (!mounted_read_only) { | ||
| 1070 | int lnum; | ||
| 1071 | |||
| 1072 | /* Check for enough free space */ | ||
| 1073 | if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { | ||
| 1074 | ubifs_err("insufficient available space"); | ||
| 1075 | err = -EINVAL; | ||
| 1076 | goto out_orphans; | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | /* Check for enough log space */ | ||
| 1080 | lnum = c->lhead_lnum + 1; | ||
| 1081 | if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) | ||
| 1082 | lnum = UBIFS_LOG_LNUM; | ||
| 1083 | if (lnum == c->ltail_lnum) { | ||
| 1084 | err = ubifs_consolidate_log(c); | ||
| 1085 | if (err) | ||
| 1086 | goto out_orphans; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | if (c->need_recovery) { | ||
| 1090 | err = ubifs_recover_size(c); | ||
| 1091 | if (err) | ||
| 1092 | goto out_orphans; | ||
| 1093 | err = ubifs_rcvry_gc_commit(c); | ||
| 1094 | } else | ||
| 1095 | err = take_gc_lnum(c); | ||
| 1096 | if (err) | ||
| 1097 | goto out_orphans; | ||
| 1098 | |||
| 1099 | err = dbg_check_lprops(c); | ||
| 1100 | if (err) | ||
| 1101 | goto out_orphans; | ||
| 1102 | } else if (c->need_recovery) { | ||
| 1103 | err = ubifs_recover_size(c); | ||
| 1104 | if (err) | ||
| 1105 | goto out_orphans; | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | spin_lock(&ubifs_infos_lock); | ||
| 1109 | list_add_tail(&c->infos_list, &ubifs_infos); | ||
| 1110 | spin_unlock(&ubifs_infos_lock); | ||
| 1111 | |||
| 1112 | if (c->need_recovery) { | ||
| 1113 | if (mounted_read_only) | ||
| 1114 | ubifs_msg("recovery deferred"); | ||
| 1115 | else { | ||
| 1116 | c->need_recovery = 0; | ||
| 1117 | ubifs_msg("recovery completed"); | ||
| 1118 | } | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | err = dbg_check_filesystem(c); | ||
| 1122 | if (err) | ||
| 1123 | goto out_infos; | ||
| 1124 | |||
| 1125 | ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, | ||
| 1126 | c->vi.vol_id); | ||
| 1127 | if (mounted_read_only) | ||
| 1128 | ubifs_msg("mounted read-only"); | ||
| 1129 | x = (long long)c->main_lebs * c->leb_size; | ||
| 1130 | ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", | ||
| 1131 | x, x >> 10, x >> 20, c->main_lebs); | ||
| 1132 | x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; | ||
| 1133 | ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", | ||
| 1134 | x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); | ||
| 1135 | ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); | ||
| 1136 | ubifs_msg("media format %d, latest format %d", | ||
| 1137 | c->fmt_version, UBIFS_FORMAT_VERSION); | ||
| 1138 | |||
| 1139 | dbg_msg("compiled on: " __DATE__ " at " __TIME__); | ||
| 1140 | dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); | ||
| 1141 | dbg_msg("LEB size: %d bytes (%d KiB)", | ||
| 1142 | c->leb_size, c->leb_size / 1024); | ||
| 1143 | dbg_msg("data journal heads: %d", | ||
| 1144 | c->jhead_cnt - NONDATA_JHEADS_CNT); | ||
| 1145 | dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" | ||
| 1146 | "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", | ||
| 1147 | c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], | ||
| 1148 | c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], | ||
| 1149 | c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], | ||
| 1150 | c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); | ||
| 1151 | dbg_msg("fast unmount: %d", c->fast_unmount); | ||
| 1152 | dbg_msg("big_lpt %d", c->big_lpt); | ||
| 1153 | dbg_msg("log LEBs: %d (%d - %d)", | ||
| 1154 | c->log_lebs, UBIFS_LOG_LNUM, c->log_last); | ||
| 1155 | dbg_msg("LPT area LEBs: %d (%d - %d)", | ||
| 1156 | c->lpt_lebs, c->lpt_first, c->lpt_last); | ||
| 1157 | dbg_msg("orphan area LEBs: %d (%d - %d)", | ||
| 1158 | c->orph_lebs, c->orph_first, c->orph_last); | ||
| 1159 | dbg_msg("main area LEBs: %d (%d - %d)", | ||
| 1160 | c->main_lebs, c->main_first, c->leb_cnt - 1); | ||
| 1161 | dbg_msg("index LEBs: %d", c->lst.idx_lebs); | ||
| 1162 | dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", | ||
| 1163 | c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); | ||
| 1164 | dbg_msg("key hash type: %d", c->key_hash_type); | ||
| 1165 | dbg_msg("tree fanout: %d", c->fanout); | ||
| 1166 | dbg_msg("reserved GC LEB: %d", c->gc_lnum); | ||
| 1167 | dbg_msg("first main LEB: %d", c->main_first); | ||
| 1168 | dbg_msg("dead watermark: %d", c->dead_wm); | ||
| 1169 | dbg_msg("dark watermark: %d", c->dark_wm); | ||
| 1170 | x = (long long)c->main_lebs * c->dark_wm; | ||
| 1171 | dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", | ||
| 1172 | x, x >> 10, x >> 20); | ||
| 1173 | dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", | ||
| 1174 | c->max_bud_bytes, c->max_bud_bytes >> 10, | ||
| 1175 | c->max_bud_bytes >> 20); | ||
| 1176 | dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", | ||
| 1177 | c->bg_bud_bytes, c->bg_bud_bytes >> 10, | ||
| 1178 | c->bg_bud_bytes >> 20); | ||
| 1179 | dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", | ||
| 1180 | c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); | ||
| 1181 | dbg_msg("max. seq. number: %llu", c->max_sqnum); | ||
| 1182 | dbg_msg("commit number: %llu", c->cmt_no); | ||
| 1183 | |||
| 1184 | return 0; | ||
| 1185 | |||
| 1186 | out_infos: | ||
| 1187 | spin_lock(&ubifs_infos_lock); | ||
| 1188 | list_del(&c->infos_list); | ||
| 1189 | spin_unlock(&ubifs_infos_lock); | ||
| 1190 | out_orphans: | ||
| 1191 | free_orphans(c); | ||
| 1192 | out_journal: | ||
| 1193 | destroy_journal(c); | ||
| 1194 | out_lpt: | ||
| 1195 | ubifs_lpt_free(c, 0); | ||
| 1196 | out_master: | ||
| 1197 | kfree(c->mst_node); | ||
| 1198 | kfree(c->rcvrd_mst_node); | ||
| 1199 | if (c->bgt) | ||
| 1200 | kthread_stop(c->bgt); | ||
| 1201 | out_wbufs: | ||
| 1202 | free_wbufs(c); | ||
| 1203 | out_cbuf: | ||
| 1204 | kfree(c->cbuf); | ||
| 1205 | out_dereg: | ||
| 1206 | dbg_failure_mode_deregistration(c); | ||
| 1207 | out_free: | ||
| 1208 | vfree(c->ileb_buf); | ||
| 1209 | vfree(c->sbuf); | ||
| 1210 | kfree(c->bottom_up_buf); | ||
| 1211 | UBIFS_DBG(vfree(c->dbg_buf)); | ||
| 1212 | return err; | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | /** | ||
| 1216 | * ubifs_umount - un-mount UBIFS file-system. | ||
| 1217 | * @c: UBIFS file-system description object | ||
| 1218 | * | ||
| 1219 | * Note, this function is called to free allocated resourced when un-mounting, | ||
| 1220 | * as well as free resources when an error occurred while we were half way | ||
| 1221 | * through mounting (error path cleanup function). So it has to make sure the | ||
| 1222 | * resource was actually allocated before freeing it. | ||
| 1223 | */ | ||
| 1224 | static void ubifs_umount(struct ubifs_info *c) | ||
| 1225 | { | ||
| 1226 | dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, | ||
| 1227 | c->vi.vol_id); | ||
| 1228 | |||
| 1229 | spin_lock(&ubifs_infos_lock); | ||
| 1230 | list_del(&c->infos_list); | ||
| 1231 | spin_unlock(&ubifs_infos_lock); | ||
| 1232 | |||
| 1233 | if (c->bgt) | ||
| 1234 | kthread_stop(c->bgt); | ||
| 1235 | |||
| 1236 | destroy_journal(c); | ||
| 1237 | free_wbufs(c); | ||
| 1238 | free_orphans(c); | ||
| 1239 | ubifs_lpt_free(c, 0); | ||
| 1240 | |||
| 1241 | kfree(c->cbuf); | ||
| 1242 | kfree(c->rcvrd_mst_node); | ||
| 1243 | kfree(c->mst_node); | ||
| 1244 | vfree(c->sbuf); | ||
| 1245 | kfree(c->bottom_up_buf); | ||
| 1246 | UBIFS_DBG(vfree(c->dbg_buf)); | ||
| 1247 | vfree(c->ileb_buf); | ||
| 1248 | dbg_failure_mode_deregistration(c); | ||
| 1249 | } | ||
| 1250 | |||
| 1251 | /** | ||
| 1252 | * ubifs_remount_rw - re-mount in read-write mode. | ||
| 1253 | * @c: UBIFS file-system description object | ||
| 1254 | * | ||
| 1255 | * UBIFS avoids allocating many unnecessary resources when mounted in read-only | ||
| 1256 | * mode. This function allocates the needed resources and re-mounts UBIFS in | ||
| 1257 | * read-write mode. | ||
| 1258 | */ | ||
| 1259 | static int ubifs_remount_rw(struct ubifs_info *c) | ||
| 1260 | { | ||
| 1261 | int err, lnum; | ||
| 1262 | |||
| 1263 | if (c->ro_media) | ||
| 1264 | return -EINVAL; | ||
| 1265 | |||
| 1266 | mutex_lock(&c->umount_mutex); | ||
| 1267 | c->remounting_rw = 1; | ||
| 1268 | |||
| 1269 | /* Check for enough free space */ | ||
| 1270 | if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { | ||
| 1271 | ubifs_err("insufficient available space"); | ||
| 1272 | err = -EINVAL; | ||
| 1273 | goto out; | ||
| 1274 | } | ||
| 1275 | |||
| 1276 | if (c->old_leb_cnt != c->leb_cnt) { | ||
| 1277 | struct ubifs_sb_node *sup; | ||
| 1278 | |||
| 1279 | sup = ubifs_read_sb_node(c); | ||
| 1280 | if (IS_ERR(sup)) { | ||
| 1281 | err = PTR_ERR(sup); | ||
| 1282 | goto out; | ||
| 1283 | } | ||
| 1284 | sup->leb_cnt = cpu_to_le32(c->leb_cnt); | ||
| 1285 | err = ubifs_write_sb_node(c, sup); | ||
| 1286 | if (err) | ||
| 1287 | goto out; | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | if (c->need_recovery) { | ||
| 1291 | ubifs_msg("completing deferred recovery"); | ||
| 1292 | err = ubifs_write_rcvrd_mst_node(c); | ||
| 1293 | if (err) | ||
| 1294 | goto out; | ||
| 1295 | err = ubifs_recover_size(c); | ||
| 1296 | if (err) | ||
| 1297 | goto out; | ||
| 1298 | err = ubifs_clean_lebs(c, c->sbuf); | ||
| 1299 | if (err) | ||
| 1300 | goto out; | ||
| 1301 | err = ubifs_recover_inl_heads(c, c->sbuf); | ||
| 1302 | if (err) | ||
| 1303 | goto out; | ||
| 1304 | } | ||
| 1305 | |||
| 1306 | if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { | ||
| 1307 | c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); | ||
| 1308 | err = ubifs_write_master(c); | ||
| 1309 | if (err) | ||
| 1310 | goto out; | ||
| 1311 | } | ||
| 1312 | |||
| 1313 | c->ileb_buf = vmalloc(c->leb_size); | ||
| 1314 | if (!c->ileb_buf) { | ||
| 1315 | err = -ENOMEM; | ||
| 1316 | goto out; | ||
| 1317 | } | ||
| 1318 | |||
| 1319 | err = ubifs_lpt_init(c, 0, 1); | ||
| 1320 | if (err) | ||
| 1321 | goto out; | ||
| 1322 | |||
| 1323 | err = alloc_wbufs(c); | ||
| 1324 | if (err) | ||
| 1325 | goto out; | ||
| 1326 | |||
| 1327 | ubifs_create_buds_lists(c); | ||
| 1328 | |||
| 1329 | /* Create background thread */ | ||
| 1330 | c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); | ||
| 1331 | if (!c->bgt) | ||
| 1332 | c->bgt = ERR_PTR(-EINVAL); | ||
| 1333 | if (IS_ERR(c->bgt)) { | ||
| 1334 | err = PTR_ERR(c->bgt); | ||
| 1335 | c->bgt = NULL; | ||
| 1336 | ubifs_err("cannot spawn \"%s\", error %d", | ||
| 1337 | c->bgt_name, err); | ||
| 1338 | return err; | ||
| 1339 | } | ||
| 1340 | wake_up_process(c->bgt); | ||
| 1341 | |||
| 1342 | c->orph_buf = vmalloc(c->leb_size); | ||
| 1343 | if (!c->orph_buf) | ||
| 1344 | return -ENOMEM; | ||
| 1345 | |||
| 1346 | /* Check for enough log space */ | ||
| 1347 | lnum = c->lhead_lnum + 1; | ||
| 1348 | if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) | ||
| 1349 | lnum = UBIFS_LOG_LNUM; | ||
| 1350 | if (lnum == c->ltail_lnum) { | ||
| 1351 | err = ubifs_consolidate_log(c); | ||
| 1352 | if (err) | ||
| 1353 | goto out; | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | if (c->need_recovery) | ||
| 1357 | err = ubifs_rcvry_gc_commit(c); | ||
| 1358 | else | ||
| 1359 | err = take_gc_lnum(c); | ||
| 1360 | if (err) | ||
| 1361 | goto out; | ||
| 1362 | |||
| 1363 | if (c->need_recovery) { | ||
| 1364 | c->need_recovery = 0; | ||
| 1365 | ubifs_msg("deferred recovery completed"); | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | dbg_gen("re-mounted read-write"); | ||
| 1369 | c->vfs_sb->s_flags &= ~MS_RDONLY; | ||
| 1370 | c->remounting_rw = 0; | ||
| 1371 | mutex_unlock(&c->umount_mutex); | ||
| 1372 | return 0; | ||
| 1373 | |||
| 1374 | out: | ||
| 1375 | vfree(c->orph_buf); | ||
| 1376 | c->orph_buf = NULL; | ||
| 1377 | if (c->bgt) { | ||
| 1378 | kthread_stop(c->bgt); | ||
| 1379 | c->bgt = NULL; | ||
| 1380 | } | ||
| 1381 | free_wbufs(c); | ||
| 1382 | vfree(c->ileb_buf); | ||
| 1383 | c->ileb_buf = NULL; | ||
| 1384 | ubifs_lpt_free(c, 1); | ||
| 1385 | c->remounting_rw = 0; | ||
| 1386 | mutex_unlock(&c->umount_mutex); | ||
| 1387 | return err; | ||
| 1388 | } | ||
| 1389 | |||
| 1390 | /** | ||
| 1391 | * commit_on_unmount - commit the journal when un-mounting. | ||
| 1392 | * @c: UBIFS file-system description object | ||
| 1393 | * | ||
| 1394 | * This function is called during un-mounting and it commits the journal unless | ||
| 1395 | * the "fast unmount" mode is enabled. It also avoids committing the journal if | ||
| 1396 | * it contains too few data. | ||
| 1397 | * | ||
| 1398 | * Sometimes recovery requires the journal to be committed at least once, and | ||
| 1399 | * this function takes care about this. | ||
| 1400 | */ | ||
| 1401 | static void commit_on_unmount(struct ubifs_info *c) | ||
| 1402 | { | ||
| 1403 | if (!c->fast_unmount) { | ||
| 1404 | long long bud_bytes; | ||
| 1405 | |||
| 1406 | spin_lock(&c->buds_lock); | ||
| 1407 | bud_bytes = c->bud_bytes; | ||
| 1408 | spin_unlock(&c->buds_lock); | ||
| 1409 | if (bud_bytes > c->leb_size) | ||
| 1410 | ubifs_run_commit(c); | ||
| 1411 | } | ||
| 1412 | } | ||
| 1413 | |||
| 1414 | /** | ||
| 1415 | * ubifs_remount_ro - re-mount in read-only mode. | ||
| 1416 | * @c: UBIFS file-system description object | ||
| 1417 | * | ||
| 1418 | * We rely on VFS to have stopped writing. Possibly the background thread could | ||
| 1419 | * be running a commit, however kthread_stop will wait in that case. | ||
| 1420 | */ | ||
| 1421 | static void ubifs_remount_ro(struct ubifs_info *c) | ||
| 1422 | { | ||
| 1423 | int i, err; | ||
| 1424 | |||
| 1425 | ubifs_assert(!c->need_recovery); | ||
| 1426 | commit_on_unmount(c); | ||
| 1427 | |||
| 1428 | mutex_lock(&c->umount_mutex); | ||
| 1429 | if (c->bgt) { | ||
| 1430 | kthread_stop(c->bgt); | ||
| 1431 | c->bgt = NULL; | ||
| 1432 | } | ||
| 1433 | |||
| 1434 | for (i = 0; i < c->jhead_cnt; i++) { | ||
| 1435 | ubifs_wbuf_sync(&c->jheads[i].wbuf); | ||
| 1436 | del_timer_sync(&c->jheads[i].wbuf.timer); | ||
| 1437 | } | ||
| 1438 | |||
| 1439 | if (!c->ro_media) { | ||
| 1440 | c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); | ||
| 1441 | c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); | ||
| 1442 | c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); | ||
| 1443 | err = ubifs_write_master(c); | ||
| 1444 | if (err) | ||
| 1445 | ubifs_ro_mode(c, err); | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | ubifs_destroy_idx_gc(c); | ||
| 1449 | free_wbufs(c); | ||
| 1450 | vfree(c->orph_buf); | ||
| 1451 | c->orph_buf = NULL; | ||
| 1452 | vfree(c->ileb_buf); | ||
| 1453 | c->ileb_buf = NULL; | ||
| 1454 | ubifs_lpt_free(c, 1); | ||
| 1455 | mutex_unlock(&c->umount_mutex); | ||
| 1456 | } | ||
| 1457 | |||
/*
 * ubifs_put_super - VFS '->put_super()' operation: finish un-mounting.
 * Writes a clean master node (if possible), frees all UBIFS resources and
 * releases the UBI volume.
 */
static void ubifs_put_super(struct super_block *sb)
{
	int i;
	struct ubifs_info *c = sb->s_fs_info;

	ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
		  c->vi.vol_id);
	/*
	 * The following asserts are only valid if there has not been a failure
	 * of the media. For example, there will be dirty inodes if we failed
	 * to write them back because of I/O errors.
	 */
	ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
	ubifs_assert(c->budg_idx_growth == 0);
	ubifs_assert(c->budg_data_growth == 0);

	/*
	 * The 'c->umount_lock' prevents races between UBIFS memory shrinker
	 * and file system un-mount. Namely, it prevents the shrinker from
	 * picking this superblock for shrinking - it will be just skipped if
	 * the mutex is locked.
	 */
	mutex_lock(&c->umount_mutex);
	if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
		/*
		 * First of all kill the background thread to make sure it does
		 * not interfere with un-mounting and freeing resources.
		 */
		if (c->bgt) {
			kthread_stop(c->bgt);
			c->bgt = NULL;
		}

		/* Synchronize write-buffers */
		if (c->jheads)
			for (i = 0; i < c->jhead_cnt; i++) {
				ubifs_wbuf_sync(&c->jheads[i].wbuf);
				del_timer_sync(&c->jheads[i].wbuf.timer);
			}

		/*
		 * On fatal errors c->ro_media is set to 1, in which case we do
		 * not write the master node.
		 */
		if (!c->ro_media) {
			/*
			 * We are being cleanly unmounted which means the
			 * orphans were killed - indicate this in the master
			 * node. Also save the reserved GC LEB number.
			 */
			int err;

			c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
			c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
			c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
			err = ubifs_write_master(c);
			if (err)
				/*
				 * Recovery will attempt to fix the master area
				 * next mount, so we just print a message and
				 * continue to unmount normally.
				 */
				ubifs_err("failed to write master node, "
					  "error %d", err);
		}
	}

	/* Free everything, release the UBI volume and the description object */
	ubifs_umount(c);
	bdi_destroy(&c->bdi);
	ubi_close_volume(c->ubi);
	mutex_unlock(&c->umount_mutex);
	kfree(c);
}
| 1531 | |||
| 1532 | static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) | ||
| 1533 | { | ||
| 1534 | int err; | ||
| 1535 | struct ubifs_info *c = sb->s_fs_info; | ||
| 1536 | |||
| 1537 | dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); | ||
| 1538 | |||
| 1539 | err = ubifs_parse_options(c, data, 1); | ||
| 1540 | if (err) { | ||
| 1541 | ubifs_err("invalid or unknown remount parameter"); | ||
| 1542 | return err; | ||
| 1543 | } | ||
| 1544 | if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { | ||
| 1545 | err = ubifs_remount_rw(c); | ||
| 1546 | if (err) | ||
| 1547 | return err; | ||
| 1548 | } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) | ||
| 1549 | ubifs_remount_ro(c); | ||
| 1550 | |||
| 1551 | return 0; | ||
| 1552 | } | ||
| 1553 | |||
/* Super-block operations UBIFS provides to the VFS */
struct super_operations ubifs_super_operations = {
	.alloc_inode    = ubifs_alloc_inode,
	.destroy_inode  = ubifs_destroy_inode,
	.put_super      = ubifs_put_super,
	.write_inode    = ubifs_write_inode,
	.delete_inode   = ubifs_delete_inode,
	.statfs         = ubifs_statfs,
	.dirty_inode    = ubifs_dirty_inode,
	.remount_fs     = ubifs_remount_fs,
	.show_options   = ubifs_show_options,
	.sync_fs        = ubifs_sync_fs,
};
| 1566 | |||
| 1567 | /** | ||
| 1568 | * open_ubi - parse UBI device name string and open the UBI device. | ||
| 1569 | * @name: UBI volume name | ||
| 1570 | * @mode: UBI volume open mode | ||
| 1571 | * | ||
| 1572 | * There are several ways to specify UBI volumes when mounting UBIFS: | ||
| 1573 | * o ubiX_Y - UBI device number X, volume Y; | ||
| 1574 | * o ubiY - UBI device number 0, volume Y; | ||
| 1575 | * o ubiX:NAME - mount UBI device X, volume with name NAME; | ||
| 1576 | * o ubi:NAME - mount UBI device 0, volume with name NAME. | ||
| 1577 | * | ||
| 1578 | * Alternative '!' separator may be used instead of ':' (because some shells | ||
| 1579 | * like busybox may interpret ':' as an NFS host name separator). This function | ||
| 1580 | * returns ubi volume object in case of success and a negative error code in | ||
| 1581 | * case of failure. | ||
| 1582 | */ | ||
| 1583 | static struct ubi_volume_desc *open_ubi(const char *name, int mode) | ||
| 1584 | { | ||
| 1585 | int dev, vol; | ||
| 1586 | char *endptr; | ||
| 1587 | |||
| 1588 | if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') | ||
| 1589 | return ERR_PTR(-EINVAL); | ||
| 1590 | |||
| 1591 | /* ubi:NAME method */ | ||
| 1592 | if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') | ||
| 1593 | return ubi_open_volume_nm(0, name + 4, mode); | ||
| 1594 | |||
| 1595 | if (!isdigit(name[3])) | ||
| 1596 | return ERR_PTR(-EINVAL); | ||
| 1597 | |||
| 1598 | dev = simple_strtoul(name + 3, &endptr, 0); | ||
| 1599 | |||
| 1600 | /* ubiY method */ | ||
| 1601 | if (*endptr == '\0') | ||
| 1602 | return ubi_open_volume(0, dev, mode); | ||
| 1603 | |||
| 1604 | /* ubiX_Y method */ | ||
| 1605 | if (*endptr == '_' && isdigit(endptr[1])) { | ||
| 1606 | vol = simple_strtoul(endptr + 1, &endptr, 0); | ||
| 1607 | if (*endptr != '\0') | ||
| 1608 | return ERR_PTR(-EINVAL); | ||
| 1609 | return ubi_open_volume(dev, vol, mode); | ||
| 1610 | } | ||
| 1611 | |||
| 1612 | /* ubiX:NAME method */ | ||
| 1613 | if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') | ||
| 1614 | return ubi_open_volume_nm(dev, ++endptr, mode); | ||
| 1615 | |||
| 1616 | return ERR_PTR(-EINVAL); | ||
| 1617 | } | ||
| 1618 | |||
/*
 * ubifs_fill_super - fill the VFS super-block for a new mount.
 * Allocates and initializes the UBIFS description object, re-opens the UBI
 * volume read-write, mounts the file-system and reads the root inode.
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* 'sb->s_fs_info' carries the UBI volume handle from ubifs_get_sb() */
	struct ubi_volume_desc *ubi = sb->s_fs_info;
	struct ubifs_info *c;
	struct inode *root;
	int err;

	c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
	if (!c)
		return -ENOMEM;

	/* Initialize locks, trees and lists of the description object */
	spin_lock_init(&c->cnt_lock);
	spin_lock_init(&c->cs_lock);
	spin_lock_init(&c->buds_lock);
	spin_lock_init(&c->space_lock);
	spin_lock_init(&c->orphan_lock);
	init_rwsem(&c->commit_sem);
	mutex_init(&c->lp_mutex);
	mutex_init(&c->tnc_mutex);
	mutex_init(&c->log_mutex);
	mutex_init(&c->mst_mutex);
	mutex_init(&c->umount_mutex);
	init_waitqueue_head(&c->cmt_wq);
	c->buds = RB_ROOT;
	c->old_idx = RB_ROOT;
	c->size_tree = RB_ROOT;
	c->orph_tree = RB_ROOT;
	INIT_LIST_HEAD(&c->infos_list);
	INIT_LIST_HEAD(&c->idx_gc);
	INIT_LIST_HEAD(&c->replay_list);
	INIT_LIST_HEAD(&c->replay_buds);
	INIT_LIST_HEAD(&c->uncat_list);
	INIT_LIST_HEAD(&c->empty_list);
	INIT_LIST_HEAD(&c->freeable_list);
	INIT_LIST_HEAD(&c->frdi_idx_list);
	INIT_LIST_HEAD(&c->unclean_leb_list);
	INIT_LIST_HEAD(&c->old_buds);
	INIT_LIST_HEAD(&c->orph_list);
	INIT_LIST_HEAD(&c->orph_new);

	c->highest_inum = UBIFS_FIRST_INO;
	get_random_bytes(&c->vfs_gen, sizeof(int));
	c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;

	ubi_get_volume_info(ubi, &c->vi);
	ubi_get_device_info(c->vi.ubi_num, &c->di);

	/* Re-open the UBI device in read-write mode */
	c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
	if (IS_ERR(c->ubi)) {
		err = PTR_ERR(c->ubi);
		goto out_free;
	}

	/*
	 * UBIFS provides 'backing_dev_info' in order to disable readahead. For
	 * UBIFS, I/O is not deferred, it is done immediately in readpage,
	 * which means the user would have to wait not just for their own I/O
	 * but the readahead I/O as well i.e. completely pointless.
	 *
	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
	 */
	c->bdi.capabilities = BDI_CAP_MAP_COPY;
	c->bdi.unplug_io_fn = default_unplug_io_fn;
	err = bdi_init(&c->bdi);
	if (err)
		goto out_close;

	err = ubifs_parse_options(c, data, 0);
	if (err)
		goto out_bdi;

	c->vfs_sb = sb;

	/* From here on 'sb->s_fs_info' points at 'c', not at 'ubi' */
	sb->s_fs_info = c;
	sb->s_magic = UBIFS_SUPER_MAGIC;
	sb->s_blocksize = UBIFS_BLOCK_SIZE;
	sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
	sb->s_dev = c->vi.cdev;
	/* Cap the maximum inode size to what the VFS can handle */
	sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
	if (c->max_inode_sz > MAX_LFS_FILESIZE)
		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
	sb->s_op = &ubifs_super_operations;

	mutex_lock(&c->umount_mutex);
	err = mount_ubifs(c);
	if (err) {
		ubifs_assert(err < 0);
		goto out_unlock;
	}

	/* Read the root inode */
	root = ubifs_iget(sb, UBIFS_ROOT_INO);
	if (IS_ERR(root)) {
		err = PTR_ERR(root);
		goto out_umount;
	}

	sb->s_root = d_alloc_root(root);
	if (!sb->s_root)
		goto out_iput;

	mutex_unlock(&c->umount_mutex);

	return 0;

out_iput:
	iput(root);
out_umount:
	ubifs_umount(c);
out_unlock:
	mutex_unlock(&c->umount_mutex);
out_bdi:
	bdi_destroy(&c->bdi);
out_close:
	ubi_close_volume(c->ubi);
out_free:
	kfree(c);
	return err;
}
| 1739 | |||
| 1740 | static int sb_test(struct super_block *sb, void *data) | ||
| 1741 | { | ||
| 1742 | dev_t *dev = data; | ||
| 1743 | |||
| 1744 | return sb->s_dev == *dev; | ||
| 1745 | } | ||
| 1746 | |||
| 1747 | static int sb_set(struct super_block *sb, void *data) | ||
| 1748 | { | ||
| 1749 | dev_t *dev = data; | ||
| 1750 | |||
| 1751 | sb->s_dev = *dev; | ||
| 1752 | return 0; | ||
| 1753 | } | ||
| 1754 | |||
/*
 * ubifs_get_sb - VFS '->get_sb()' operation: mount a UBIFS file-system.
 * Parses @name into a UBI volume, finds or creates the corresponding VFS
 * super-block and attaches it to @mnt. Returns zero on success and a
 * negative error code on failure.
 */
static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
			const char *name, void *data, struct vfsmount *mnt)
{
	struct ubi_volume_desc *ubi;
	struct ubi_volume_info vi;
	struct super_block *sb;
	int err;

	dbg_gen("name %s, flags %#x", name, flags);

	/*
	 * Get UBI device number and volume ID. Mount it read-only so far
	 * because this might be a new mount point, and UBI allows only one
	 * read-write user at a time.
	 */
	ubi = open_ubi(name, UBI_READONLY);
	if (IS_ERR(ubi)) {
		ubifs_err("cannot open \"%s\", error %d",
			  name, (int)PTR_ERR(ubi));
		return PTR_ERR(ubi);
	}
	ubi_get_volume_info(ubi, &vi);

	dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);

	/* Look up an existing super-block for this volume, or allocate one */
	sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
	if (IS_ERR(sb)) {
		err = PTR_ERR(sb);
		goto out_close;
	}

	if (sb->s_root) {
		/* A new mount point for already mounted UBIFS */
		dbg_gen("this ubi volume is already mounted");
		/* R/O and R/W mounts of the same volume cannot coexist */
		if ((flags ^ sb->s_flags) & MS_RDONLY) {
			err = -EBUSY;
			goto out_deact;
		}
	} else {
		sb->s_flags = flags;
		/*
		 * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is
		 * replaced by 'c'.
		 */
		sb->s_fs_info = ubi;
		err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
		if (err)
			goto out_deact;
		/* We do not support atime */
		sb->s_flags |= MS_ACTIVE | MS_NOATIME;
	}

	/* 'fill_super()' opens ubi again so we must close it here */
	ubi_close_volume(ubi);

	return simple_set_mnt(mnt, sb);

out_deact:
	up_write(&sb->s_umount);
	deactivate_super(sb);
out_close:
	ubi_close_volume(ubi);
	return err;
}
| 1819 | |||
| 1820 | static void ubifs_kill_sb(struct super_block *sb) | ||
| 1821 | { | ||
| 1822 | struct ubifs_info *c = sb->s_fs_info; | ||
| 1823 | |||
| 1824 | /* | ||
| 1825 | * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' | ||
| 1826 | * in order to be outside BKL. | ||
| 1827 | */ | ||
| 1828 | if (sb->s_root && !(sb->s_flags & MS_RDONLY)) | ||
| 1829 | commit_on_unmount(c); | ||
| 1830 | /* The un-mount routine is actually done in put_super() */ | ||
| 1831 | generic_shutdown_super(sb); | ||
| 1832 | } | ||
| 1833 | |||
/* UBIFS file-system type registered with the VFS */
static struct file_system_type ubifs_fs_type = {
	.name    = "ubifs",
	.owner   = THIS_MODULE,
	.get_sb  = ubifs_get_sb,
	.kill_sb = ubifs_kill_sb
};
| 1840 | |||
| 1841 | /* | ||
| 1842 | * Inode slab cache constructor. | ||
| 1843 | */ | ||
| 1844 | static void inode_slab_ctor(struct kmem_cache *cachep, void *obj) | ||
| 1845 | { | ||
| 1846 | struct ubifs_inode *ui = obj; | ||
| 1847 | inode_init_once(&ui->vfs_inode); | ||
| 1848 | } | ||
| 1849 | |||
| 1850 | static int __init ubifs_init(void) | ||
| 1851 | { | ||
| 1852 | int err; | ||
| 1853 | |||
| 1854 | BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); | ||
| 1855 | |||
| 1856 | /* Make sure node sizes are 8-byte aligned */ | ||
| 1857 | BUILD_BUG_ON(UBIFS_CH_SZ & 7); | ||
| 1858 | BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); | ||
| 1859 | BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); | ||
| 1860 | BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); | ||
| 1861 | BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); | ||
| 1862 | BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); | ||
| 1863 | BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); | ||
| 1864 | BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); | ||
| 1865 | BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); | ||
| 1866 | BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); | ||
| 1867 | BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); | ||
| 1868 | |||
| 1869 | BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); | ||
| 1870 | BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); | ||
| 1871 | BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); | ||
| 1872 | BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); | ||
| 1873 | BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); | ||
| 1874 | BUILD_BUG_ON(MIN_WRITE_SZ & 7); | ||
| 1875 | |||
| 1876 | /* Check min. node size */ | ||
| 1877 | BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); | ||
| 1878 | BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); | ||
| 1879 | BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); | ||
| 1880 | BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); | ||
| 1881 | |||
| 1882 | BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); | ||
| 1883 | BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); | ||
| 1884 | BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); | ||
| 1885 | BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); | ||
| 1886 | |||
| 1887 | /* Defined node sizes */ | ||
| 1888 | BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); | ||
| 1889 | BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); | ||
| 1890 | BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); | ||
| 1891 | BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); | ||
| 1892 | |||
| 1893 | /* | ||
| 1894 | * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to | ||
| 1895 | * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. | ||
| 1896 | */ | ||
| 1897 | if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { | ||
| 1898 | ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" | ||
| 1899 | " at least 4096 bytes", | ||
| 1900 | (unsigned int)PAGE_CACHE_SIZE); | ||
| 1901 | return -EINVAL; | ||
| 1902 | } | ||
| 1903 | |||
| 1904 | err = register_filesystem(&ubifs_fs_type); | ||
| 1905 | if (err) { | ||
| 1906 | ubifs_err("cannot register file system, error %d", err); | ||
| 1907 | return err; | ||
| 1908 | } | ||
| 1909 | |||
| 1910 | err = -ENOMEM; | ||
| 1911 | ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", | ||
| 1912 | sizeof(struct ubifs_inode), 0, | ||
| 1913 | SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, | ||
| 1914 | &inode_slab_ctor); | ||
| 1915 | if (!ubifs_inode_slab) | ||
| 1916 | goto out_reg; | ||
| 1917 | |||
| 1918 | register_shrinker(&ubifs_shrinker_info); | ||
| 1919 | |||
| 1920 | err = ubifs_compressors_init(); | ||
| 1921 | if (err) | ||
| 1922 | goto out_compr; | ||
| 1923 | |||
| 1924 | return 0; | ||
| 1925 | |||
| 1926 | out_compr: | ||
| 1927 | unregister_shrinker(&ubifs_shrinker_info); | ||
| 1928 | kmem_cache_destroy(ubifs_inode_slab); | ||
| 1929 | out_reg: | ||
| 1930 | unregister_filesystem(&ubifs_fs_type); | ||
| 1931 | return err; | ||
| 1932 | } | ||
| 1933 | /* late_initcall to let compressors initialize first */ | ||
| 1934 | late_initcall(ubifs_init); | ||
| 1935 | |||
| 1936 | static void __exit ubifs_exit(void) | ||
| 1937 | { | ||
| 1938 | ubifs_assert(list_empty(&ubifs_infos)); | ||
| 1939 | ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); | ||
| 1940 | |||
| 1941 | ubifs_compressors_exit(); | ||
| 1942 | unregister_shrinker(&ubifs_shrinker_info); | ||
| 1943 | kmem_cache_destroy(ubifs_inode_slab); | ||
| 1944 | unregister_filesystem(&ubifs_fs_type); | ||
| 1945 | } | ||
| 1946 | module_exit(ubifs_exit); | ||
| 1947 | |||
/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_VERSION(__stringify(UBIFS_VERSION));
MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
MODULE_DESCRIPTION("UBIFS - UBI File System");
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c new file mode 100644 index 000000000000..e909f4a96443 --- /dev/null +++ b/fs/ubifs/tnc.c | |||
| @@ -0,0 +1,2956 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements TNC (Tree Node Cache) which caches indexing nodes of | ||
| 25 | * the UBIFS B-tree. | ||
| 26 | * | ||
| 27 | * At the moment the locking rules of the TNC tree are quite simple and | ||
| 28 | * straightforward. We just have a mutex and lock it when we traverse the | ||
| 29 | * tree. If a znode is not in memory, we read it from flash while still having | ||
| 30 | * the mutex locked. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include <linux/crc32.h> | ||
| 34 | #include "ubifs.h" | ||
| 35 | |||
/*
 * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions.
 * @NAME_LESS: name corresponding to the first argument is less than second
 * @NAME_MATCHES: names match
 * @NAME_GREATER: name corresponding to the first argument is greater than
 *                the second
 * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media
 *
 * These constants were introduced to improve readability.
 */
enum {
	NAME_LESS = 0,
	NAME_MATCHES = 1,
	NAME_GREATER = 2,
	NOT_ON_MEDIA = 3,
};
| 52 | |||
| 53 | /** | ||
| 54 | * insert_old_idx - record an index node obsoleted since the last commit start. | ||
| 55 | * @c: UBIFS file-system description object | ||
| 56 | * @lnum: LEB number of obsoleted index node | ||
| 57 | * @offs: offset of obsoleted index node | ||
| 58 | * | ||
| 59 | * Returns %0 on success, and a negative error code on failure. | ||
| 60 | * | ||
| 61 | * For recovery, there must always be a complete intact version of the index on | ||
| 62 | * flash at all times. That is called the "old index". It is the index as at the | ||
| 63 | * time of the last successful commit. Many of the index nodes in the old index | ||
| 64 | * may be dirty, but they must not be erased until the next successful commit | ||
| 65 | * (at which point that index becomes the old index). | ||
| 66 | * | ||
| 67 | * That means that the garbage collection and the in-the-gaps method of | ||
| 68 | * committing must be able to determine if an index node is in the old index. | ||
| 69 | * Most of the old index nodes can be found by looking up the TNC using the | ||
| 70 | * 'lookup_znode()' function. However, some of the old index nodes may have | ||
| 71 | * been deleted from the current index or may have been changed so much that | ||
| 72 | * they cannot be easily found. In those cases, an entry is added to an RB-tree. | ||
| 73 | * That is what this function does. The RB-tree is ordered by LEB number and | ||
| 74 | * offset because they uniquely identify the old index node. | ||
| 75 | */ | ||
| 76 | static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) | ||
| 77 | { | ||
| 78 | struct ubifs_old_idx *old_idx, *o; | ||
| 79 | struct rb_node **p, *parent = NULL; | ||
| 80 | |||
| 81 | old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); | ||
| 82 | if (unlikely(!old_idx)) | ||
| 83 | return -ENOMEM; | ||
| 84 | old_idx->lnum = lnum; | ||
| 85 | old_idx->offs = offs; | ||
| 86 | |||
| 87 | p = &c->old_idx.rb_node; | ||
| 88 | while (*p) { | ||
| 89 | parent = *p; | ||
| 90 | o = rb_entry(parent, struct ubifs_old_idx, rb); | ||
| 91 | if (lnum < o->lnum) | ||
| 92 | p = &(*p)->rb_left; | ||
| 93 | else if (lnum > o->lnum) | ||
| 94 | p = &(*p)->rb_right; | ||
| 95 | else if (offs < o->offs) | ||
| 96 | p = &(*p)->rb_left; | ||
| 97 | else if (offs > o->offs) | ||
| 98 | p = &(*p)->rb_right; | ||
| 99 | else { | ||
| 100 | ubifs_err("old idx added twice!"); | ||
| 101 | kfree(old_idx); | ||
| 102 | return 0; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | rb_link_node(&old_idx->rb, parent, p); | ||
| 106 | rb_insert_color(&old_idx->rb, &c->old_idx); | ||
| 107 | return 0; | ||
| 108 | } | ||
| 109 | |||
| 110 | /** | ||
| 111 | * insert_old_idx_znode - record a znode obsoleted since last commit start. | ||
| 112 | * @c: UBIFS file-system description object | ||
| 113 | * @znode: znode of obsoleted index node | ||
| 114 | * | ||
| 115 | * Returns %0 on success, and a negative error code on failure. | ||
| 116 | */ | ||
| 117 | int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) | ||
| 118 | { | ||
| 119 | if (znode->parent) { | ||
| 120 | struct ubifs_zbranch *zbr; | ||
| 121 | |||
| 122 | zbr = &znode->parent->zbranch[znode->iip]; | ||
| 123 | if (zbr->len) | ||
| 124 | return insert_old_idx(c, zbr->lnum, zbr->offs); | ||
| 125 | } else | ||
| 126 | if (c->zroot.len) | ||
| 127 | return insert_old_idx(c, c->zroot.lnum, | ||
| 128 | c->zroot.offs); | ||
| 129 | return 0; | ||
| 130 | } | ||
| 131 | |||
| 132 | /** | ||
| 133 | * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. | ||
| 134 | * @c: UBIFS file-system description object | ||
| 135 | * @znode: znode of obsoleted index node | ||
| 136 | * | ||
| 137 | * Returns %0 on success, and a negative error code on failure. | ||
| 138 | */ | ||
| 139 | static int ins_clr_old_idx_znode(struct ubifs_info *c, | ||
| 140 | struct ubifs_znode *znode) | ||
| 141 | { | ||
| 142 | int err; | ||
| 143 | |||
| 144 | if (znode->parent) { | ||
| 145 | struct ubifs_zbranch *zbr; | ||
| 146 | |||
| 147 | zbr = &znode->parent->zbranch[znode->iip]; | ||
| 148 | if (zbr->len) { | ||
| 149 | err = insert_old_idx(c, zbr->lnum, zbr->offs); | ||
| 150 | if (err) | ||
| 151 | return err; | ||
| 152 | zbr->lnum = 0; | ||
| 153 | zbr->offs = 0; | ||
| 154 | zbr->len = 0; | ||
| 155 | } | ||
| 156 | } else | ||
| 157 | if (c->zroot.len) { | ||
| 158 | err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); | ||
| 159 | if (err) | ||
| 160 | return err; | ||
| 161 | c->zroot.lnum = 0; | ||
| 162 | c->zroot.offs = 0; | ||
| 163 | c->zroot.len = 0; | ||
| 164 | } | ||
| 165 | return 0; | ||
| 166 | } | ||
| 167 | |||
/**
 * destroy_old_idx - destroy the old_idx RB-tree.
 * @c: UBIFS file-system description object
 *
 * During start commit, the old_idx RB-tree is used to avoid overwriting index
 * nodes that were in the index last commit but have since been deleted. This
 * is necessary for recovery i.e. the old index must be kept intact until the
 * new index is successfully written. The old-idx RB-tree is used for the
 * in-the-gaps method of writing index nodes and is destroyed every commit.
 */
void destroy_old_idx(struct ubifs_info *c)
{
	struct rb_node *this = c->old_idx.rb_node;
	struct ubifs_old_idx *old_idx;

	/*
	 * Manual post-order traversal: descend to a leaf, free it, then
	 * detach it from its parent so it is never visited again. This
	 * avoids recursion and never touches a node after it is freed.
	 */
	while (this) {
		if (this->rb_left) {
			this = this->rb_left;
			continue;
		} else if (this->rb_right) {
			this = this->rb_right;
			continue;
		}
		old_idx = rb_entry(this, struct ubifs_old_idx, rb);
		this = rb_parent(this);
		if (this) {
			/* Unlink the child we are about to free */
			if (this->rb_left == &old_idx->rb)
				this->rb_left = NULL;
			else
				this->rb_right = NULL;
		}
		kfree(old_idx);
	}
	c->old_idx = RB_ROOT;
}
| 203 | |||
| 204 | /** | ||
| 205 | * copy_znode - copy a dirty znode. | ||
| 206 | * @c: UBIFS file-system description object | ||
| 207 | * @znode: znode to copy | ||
| 208 | * | ||
| 209 | * A dirty znode being committed may not be changed, so it is copied. | ||
| 210 | */ | ||
| 211 | static struct ubifs_znode *copy_znode(struct ubifs_info *c, | ||
| 212 | struct ubifs_znode *znode) | ||
| 213 | { | ||
| 214 | struct ubifs_znode *zn; | ||
| 215 | |||
| 216 | zn = kmalloc(c->max_znode_sz, GFP_NOFS); | ||
| 217 | if (unlikely(!zn)) | ||
| 218 | return ERR_PTR(-ENOMEM); | ||
| 219 | |||
| 220 | memcpy(zn, znode, c->max_znode_sz); | ||
| 221 | zn->cnext = NULL; | ||
| 222 | __set_bit(DIRTY_ZNODE, &zn->flags); | ||
| 223 | __clear_bit(COW_ZNODE, &zn->flags); | ||
| 224 | |||
| 225 | ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); | ||
| 226 | __set_bit(OBSOLETE_ZNODE, &znode->flags); | ||
| 227 | |||
| 228 | if (znode->level != 0) { | ||
| 229 | int i; | ||
| 230 | const int n = zn->child_cnt; | ||
| 231 | |||
| 232 | /* The children now have new parent */ | ||
| 233 | for (i = 0; i < n; i++) { | ||
| 234 | struct ubifs_zbranch *zbr = &zn->zbranch[i]; | ||
| 235 | |||
| 236 | if (zbr->znode) | ||
| 237 | zbr->znode->parent = zn; | ||
| 238 | } | ||
| 239 | } | ||
| 240 | |||
| 241 | atomic_long_inc(&c->dirty_zn_cnt); | ||
| 242 | return zn; | ||
| 243 | } | ||
| 244 | |||
/**
 * add_idx_dirt - add dirt due to a dirty znode.
 * @c: UBIFS file-system description object
 * @lnum: LEB number of index node
 * @dirt: size of index node
 *
 * This function updates lprops dirty space and the new size of the index.
 */
static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
{
	/* Index nodes are 8-byte aligned on flash, so shrink the index size
	 * estimate by the aligned amount */
	c->calc_idx_sz -= ALIGN(dirt, 8);
	return ubifs_add_dirt(c, lnum, dirt);
}
| 258 | |||
/**
 * dirty_cow_znode - ensure a znode is not being committed.
 * @c: UBIFS file-system description object
 * @zbr: branch of znode to check
 *
 * Returns dirtied znode on success or negative error code on failure.
 */
static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
					   struct ubifs_zbranch *zbr)
{
	struct ubifs_znode *znode = zbr->znode;
	struct ubifs_znode *zn;
	int err;

	if (!test_bit(COW_ZNODE, &znode->flags)) {
		/* znode is not being committed */
		if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
			/*
			 * Clean -> dirty transition: update the znode
			 * counters and account the old on-flash copy as
			 * dirty space.
			 */
			atomic_long_inc(&c->dirty_zn_cnt);
			atomic_long_dec(&c->clean_zn_cnt);
			atomic_long_dec(&ubifs_clean_zn_cnt);
			err = add_idx_dirt(c, zbr->lnum, zbr->len);
			if (unlikely(err))
				return ERR_PTR(err);
		}
		return znode;
	}

	/* The znode is part of a commit - copy-on-write it instead */
	zn = copy_znode(c, znode);
	if (unlikely(IS_ERR(zn)))
		return zn;

	if (zbr->len) {
		/*
		 * Record the old on-flash location so the old index stays
		 * recognizable for recovery, then account it as dirt.
		 */
		err = insert_old_idx(c, zbr->lnum, zbr->offs);
		if (unlikely(err))
			return ERR_PTR(err);
		err = add_idx_dirt(c, zbr->lnum, zbr->len);
	} else
		err = 0;

	/*
	 * Re-point the branch at the copy and clear its flash position even
	 * if 'add_idx_dirt()' failed above - the copy has already been made
	 * and must stay reachable; the error is reported afterwards.
	 */
	zbr->znode = zn;
	zbr->lnum = 0;
	zbr->offs = 0;
	zbr->len = 0;

	if (unlikely(err))
		return ERR_PTR(err);
	return zn;
}
| 307 | |||
| 308 | /** | ||
| 309 | * lnc_add - add a leaf node to the leaf node cache. | ||
| 310 | * @c: UBIFS file-system description object | ||
| 311 | * @zbr: zbranch of leaf node | ||
| 312 | * @node: leaf node | ||
| 313 | * | ||
| 314 | * Leaf nodes are non-index nodes directory entry nodes or data nodes. The | ||
| 315 | * purpose of the leaf node cache is to save re-reading the same leaf node over | ||
| 316 | * and over again. Most things are cached by VFS, however the file system must | ||
| 317 | * cache directory entries for readdir and for resolving hash collisions. The | ||
| 318 | * present implementation of the leaf node cache is extremely simple, and | ||
| 319 | * allows for error returns that are not used but that may be needed if a more | ||
| 320 | * complex implementation is created. | ||
| 321 | * | ||
| 322 | * Note, this function does not add the @node object to LNC directly, but | ||
| 323 | * allocates a copy of the object and adds the copy to LNC. The reason for this | ||
| 324 | * is that @node has been allocated outside of the TNC subsystem and will be | ||
| 325 | * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC | ||
| 326 | * may be changed at any time, e.g. freed by the shrinker. | ||
| 327 | */ | ||
| 328 | static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, | ||
| 329 | const void *node) | ||
| 330 | { | ||
| 331 | int err; | ||
| 332 | void *lnc_node; | ||
| 333 | const struct ubifs_dent_node *dent = node; | ||
| 334 | |||
| 335 | ubifs_assert(!zbr->leaf); | ||
| 336 | ubifs_assert(zbr->len != 0); | ||
| 337 | ubifs_assert(is_hash_key(c, &zbr->key)); | ||
| 338 | |||
| 339 | err = ubifs_validate_entry(c, dent); | ||
| 340 | if (err) { | ||
| 341 | dbg_dump_stack(); | ||
| 342 | dbg_dump_node(c, dent); | ||
| 343 | return err; | ||
| 344 | } | ||
| 345 | |||
| 346 | lnc_node = kmalloc(zbr->len, GFP_NOFS); | ||
| 347 | if (!lnc_node) | ||
| 348 | /* We don't have to have the cache, so no error */ | ||
| 349 | return 0; | ||
| 350 | |||
| 351 | memcpy(lnc_node, node, zbr->len); | ||
| 352 | zbr->leaf = lnc_node; | ||
| 353 | return 0; | ||
| 354 | } | ||
| 355 | |||
| 356 | /** | ||
| 357 | * lnc_add_directly - add a leaf node to the leaf-node-cache. | ||
| 358 | * @c: UBIFS file-system description object | ||
| 359 | * @zbr: zbranch of leaf node | ||
| 360 | * @node: leaf node | ||
| 361 | * | ||
| 362 | * This function is similar to 'lnc_add()', but it does not create a copy of | ||
| 363 | * @node but inserts @node to TNC directly. | ||
| 364 | */ | ||
| 365 | static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, | ||
| 366 | void *node) | ||
| 367 | { | ||
| 368 | int err; | ||
| 369 | |||
| 370 | ubifs_assert(!zbr->leaf); | ||
| 371 | ubifs_assert(zbr->len != 0); | ||
| 372 | |||
| 373 | err = ubifs_validate_entry(c, node); | ||
| 374 | if (err) { | ||
| 375 | dbg_dump_stack(); | ||
| 376 | dbg_dump_node(c, node); | ||
| 377 | return err; | ||
| 378 | } | ||
| 379 | |||
| 380 | zbr->leaf = node; | ||
| 381 | return 0; | ||
| 382 | } | ||
| 383 | |||
| 384 | /** | ||
| 385 | * lnc_free - remove a leaf node from the leaf node cache. | ||
| 386 | * @zbr: zbranch of leaf node | ||
| 387 | * @node: leaf node | ||
| 388 | */ | ||
| 389 | static void lnc_free(struct ubifs_zbranch *zbr) | ||
| 390 | { | ||
| 391 | if (!zbr->leaf) | ||
| 392 | return; | ||
| 393 | kfree(zbr->leaf); | ||
| 394 | zbr->leaf = NULL; | ||
| 395 | } | ||
| 396 | |||
| 397 | /** | ||
| 398 | * tnc_read_node_nm - read a "hashed" leaf node. | ||
| 399 | * @c: UBIFS file-system description object | ||
| 400 | * @zbr: key and position of the node | ||
| 401 | * @node: node is returned here | ||
| 402 | * | ||
| 403 | * This function reads a "hashed" node defined by @zbr from the leaf node cache | ||
| 404 | * (in it is there) or from the hash media, in which case the node is also | ||
| 405 | * added to LNC. Returns zero in case of success or a negative negative error | ||
| 406 | * code in case of failure. | ||
| 407 | */ | ||
| 408 | static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, | ||
| 409 | void *node) | ||
| 410 | { | ||
| 411 | int err; | ||
| 412 | |||
| 413 | ubifs_assert(is_hash_key(c, &zbr->key)); | ||
| 414 | |||
| 415 | if (zbr->leaf) { | ||
| 416 | /* Read from the leaf node cache */ | ||
| 417 | ubifs_assert(zbr->len != 0); | ||
| 418 | memcpy(node, zbr->leaf, zbr->len); | ||
| 419 | return 0; | ||
| 420 | } | ||
| 421 | |||
| 422 | err = ubifs_tnc_read_node(c, zbr, node); | ||
| 423 | if (err) | ||
| 424 | return err; | ||
| 425 | |||
| 426 | /* Add the node to the leaf node cache */ | ||
| 427 | err = lnc_add(c, zbr, node); | ||
| 428 | return err; | ||
| 429 | } | ||
| 430 | |||
| 431 | /** | ||
| 432 | * try_read_node - read a node if it is a node. | ||
| 433 | * @c: UBIFS file-system description object | ||
| 434 | * @buf: buffer to read to | ||
| 435 | * @type: node type | ||
| 436 | * @len: node length (not aligned) | ||
| 437 | * @lnum: LEB number of node to read | ||
| 438 | * @offs: offset of node to read | ||
| 439 | * | ||
| 440 | * This function tries to read a node of known type and length, checks it and | ||
| 441 | * stores it in @buf. This function returns %1 if a node is present and %0 if | ||
| 442 | * a node is not present. A negative error code is returned for I/O errors. | ||
| 443 | * This function performs that same function as ubifs_read_node except that | ||
| 444 | * it does not require that there is actually a node present and instead | ||
| 445 | * the return code indicates if a node was read. | ||
| 446 | */ | ||
| 447 | static int try_read_node(const struct ubifs_info *c, void *buf, int type, | ||
| 448 | int len, int lnum, int offs) | ||
| 449 | { | ||
| 450 | int err, node_len; | ||
| 451 | struct ubifs_ch *ch = buf; | ||
| 452 | uint32_t crc, node_crc; | ||
| 453 | |||
| 454 | dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); | ||
| 455 | |||
| 456 | err = ubi_read(c->ubi, lnum, buf, offs, len); | ||
| 457 | if (err) { | ||
| 458 | ubifs_err("cannot read node type %d from LEB %d:%d, error %d", | ||
| 459 | type, lnum, offs, err); | ||
| 460 | return err; | ||
| 461 | } | ||
| 462 | |||
| 463 | if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) | ||
| 464 | return 0; | ||
| 465 | |||
| 466 | if (ch->node_type != type) | ||
| 467 | return 0; | ||
| 468 | |||
| 469 | node_len = le32_to_cpu(ch->len); | ||
| 470 | if (node_len != len) | ||
| 471 | return 0; | ||
| 472 | |||
| 473 | crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); | ||
| 474 | node_crc = le32_to_cpu(ch->crc); | ||
| 475 | if (crc != node_crc) | ||
| 476 | return 0; | ||
| 477 | |||
| 478 | return 1; | ||
| 479 | } | ||
| 480 | |||
| 481 | /** | ||
| 482 | * fallible_read_node - try to read a leaf node. | ||
| 483 | * @c: UBIFS file-system description object | ||
| 484 | * @key: key of node to read | ||
| 485 | * @zbr: position of node | ||
| 486 | * @node: node returned | ||
| 487 | * | ||
| 488 | * This function tries to read a node and returns %1 if the node is read, %0 | ||
| 489 | * if the node is not present, and a negative error code in the case of error. | ||
| 490 | */ | ||
| 491 | static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, | ||
| 492 | struct ubifs_zbranch *zbr, void *node) | ||
| 493 | { | ||
| 494 | int ret; | ||
| 495 | |||
| 496 | dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); | ||
| 497 | |||
| 498 | ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, | ||
| 499 | zbr->offs); | ||
| 500 | if (ret == 1) { | ||
| 501 | union ubifs_key node_key; | ||
| 502 | struct ubifs_dent_node *dent = node; | ||
| 503 | |||
| 504 | /* All nodes have key in the same place */ | ||
| 505 | key_read(c, &dent->key, &node_key); | ||
| 506 | if (keys_cmp(c, key, &node_key) != 0) | ||
| 507 | ret = 0; | ||
| 508 | } | ||
| 509 | if (ret == 0) | ||
| 510 | dbg_mnt("dangling branch LEB %d:%d len %d, key %s", | ||
| 511 | zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); | ||
| 512 | return ret; | ||
| 513 | } | ||
| 514 | |||
/**
 * matches_name - determine if a direntry or xattr entry matches a given name.
 * @c: UBIFS file-system description object
 * @zbr: zbranch of dent
 * @nm: name to match
 *
 * This function checks if xentry/direntry referred by zbranch @zbr matches name
 * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by
 * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case
 * of failure, a negative error code is returned.
 */
static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
			const struct qstr *nm)
{
	struct ubifs_dent_node *dent;
	int nlen, err;

	/* If possible, match against the dent in the leaf node cache */
	if (!zbr->leaf) {
		dent = kmalloc(zbr->len, GFP_NOFS);
		if (!dent)
			return -ENOMEM;

		err = ubifs_tnc_read_node(c, zbr, dent);
		if (err)
			goto out_free;

		/* Add the node to the leaf node cache */
		err = lnc_add_directly(c, zbr, dent);
		if (err)
			goto out_free;
		/* On success the LNC owns @dent, so it must not be freed */
	} else
		dent = zbr->leaf;

	/* Compare the common prefix; ties are broken by name length */
	nlen = le16_to_cpu(dent->nlen);
	err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
	if (err == 0) {
		if (nlen == nm->len)
			return NAME_MATCHES;
		else if (nlen < nm->len)
			return NAME_LESS;
		else
			return NAME_GREATER;
	} else if (err < 0)
		return NAME_LESS;
	else
		return NAME_GREATER;

out_free:
	kfree(dent);
	return err;
}
| 567 | |||
| 568 | /** | ||
| 569 | * get_znode - get a TNC znode that may not be loaded yet. | ||
| 570 | * @c: UBIFS file-system description object | ||
| 571 | * @znode: parent znode | ||
| 572 | * @n: znode branch slot number | ||
| 573 | * | ||
| 574 | * This function returns the znode or a negative error code. | ||
| 575 | */ | ||
| 576 | static struct ubifs_znode *get_znode(struct ubifs_info *c, | ||
| 577 | struct ubifs_znode *znode, int n) | ||
| 578 | { | ||
| 579 | struct ubifs_zbranch *zbr; | ||
| 580 | |||
| 581 | zbr = &znode->zbranch[n]; | ||
| 582 | if (zbr->znode) | ||
| 583 | znode = zbr->znode; | ||
| 584 | else | ||
| 585 | znode = ubifs_load_znode(c, zbr, znode, n); | ||
| 586 | return znode; | ||
| 587 | } | ||
| 588 | |||
/**
 * tnc_next - find next TNC entry.
 * @c: UBIFS file-system description object
 * @zn: znode is passed and returned here
 * @n: znode branch slot number is passed and returned here
 *
 * This function returns %0 if the next TNC entry is found, %-ENOENT if there is
 * no next entry, or a negative error code otherwise.
 */
static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
{
	struct ubifs_znode *znode = *zn;
	int nn = *n;

	/* Fast path: next branch is in the same znode */
	nn += 1;
	if (nn < znode->child_cnt) {
		*n = nn;
		return 0;
	}
	/*
	 * Climb up until an ancestor has a branch to the right of the one we
	 * came from, then descend along the leftmost path of that subtree
	 * down to level 0.
	 */
	while (1) {
		struct ubifs_znode *zp;

		zp = znode->parent;
		if (!zp)
			return -ENOENT;
		nn = znode->iip + 1;
		znode = zp;
		if (nn < znode->child_cnt) {
			znode = get_znode(c, znode, nn);
			if (IS_ERR(znode))
				return PTR_ERR(znode);
			while (znode->level != 0) {
				znode = get_znode(c, znode, 0);
				if (IS_ERR(znode))
					return PTR_ERR(znode);
			}
			nn = 0;
			break;
		}
	}
	*zn = znode;
	*n = nn;
	return 0;
}
| 633 | |||
/**
 * tnc_prev - find previous TNC entry.
 * @c: UBIFS file-system description object
 * @zn: znode is returned here
 * @n: znode branch slot number is passed and returned here
 *
 * This function returns %0 if the previous TNC entry is found, %-ENOENT if
 * there is no previous entry, or a negative error code otherwise.
 */
static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
{
	struct ubifs_znode *znode = *zn;
	int nn = *n;

	/* Fast path: previous branch is in the same znode */
	if (nn > 0) {
		*n = nn - 1;
		return 0;
	}
	/*
	 * Climb up until an ancestor has a branch to the left of the one we
	 * came from, then descend along the rightmost path of that subtree
	 * down to level 0.
	 */
	while (1) {
		struct ubifs_znode *zp;

		zp = znode->parent;
		if (!zp)
			return -ENOENT;
		nn = znode->iip - 1;
		znode = zp;
		if (nn >= 0) {
			znode = get_znode(c, znode, nn);
			if (IS_ERR(znode))
				return PTR_ERR(znode);
			while (znode->level != 0) {
				nn = znode->child_cnt - 1;
				znode = get_znode(c, znode, nn);
				if (IS_ERR(znode))
					return PTR_ERR(znode);
			}
			nn = znode->child_cnt - 1;
			break;
		}
	}
	*zn = znode;
	*n = nn;
	return 0;
}
| 678 | |||
/**
 * resolve_collision - resolve a collision.
 * @c: UBIFS file-system description object
 * @key: key of a directory or extended attribute entry
 * @zn: znode is returned here
 * @n: zbranch number is passed and returned here
 * @nm: name of the entry
 *
 * This function is called for "hashed" keys to make sure that the found key
 * really corresponds to the looked up node (directory or extended attribute
 * entry). It returns %1 and sets @zn and @n if the collision is resolved.
 * %0 is returned if @nm is not found and @zn and @n are set to the previous
 * entry, i.e. to the entry after which @nm could follow if it were in TNC.
 * This means that @n may be set to %-1 if the leftmost key in @zn is the
 * previous one. A negative error code is returned on failures.
 */
static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
			     struct ubifs_znode **zn, int *n,
			     const struct qstr *nm)
{
	int err;

	err = matches_name(c, &(*zn)->zbranch[*n], nm);
	if (unlikely(err < 0))
		return err;
	if (err == NAME_MATCHES)
		return 1;

	if (err == NAME_GREATER) {
		/* Look left */
		while (1) {
			err = tnc_prev(c, zn, n);
			if (err == -ENOENT) {
				/* Walked off the left edge of the tree */
				ubifs_assert(*n == 0);
				*n = -1;
				return 0;
			}
			if (err < 0)
				return err;
			/* A different key means we left the collision run */
			if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
				/*
				 * We have found the branch after which we would
				 * like to insert, but inserting in this znode
				 * may still be wrong. Consider the following 3
				 * znodes, in the case where we are resolving a
				 * collision with Key2.
				 *
				 *                  znode zp
				 *            ----------------------
				 * level 1     |  Key0  |  Key1  |
				 *            -----------------------
				 *                 |            |
				 *       znode za  |            |  znode zb
				 *          ------------      ------------
				 * level 0  |  Key0  |        |  Key2  |
				 *          ------------      ------------
				 *
				 * The lookup finds Key2 in znode zb. Lets say
				 * there is no match and the name is greater so
				 * we look left. When we find Key0, we end up
				 * here. If we return now, we will insert into
				 * znode za at slot n = 1. But that is invalid
				 * according to the parent's keys. Key2 must
				 * be inserted into znode zb.
				 *
				 * Note, this problem is not relevant for the
				 * case when we go right, because
				 * 'tnc_insert()' would correct the parent key.
				 */
				if (*n == (*zn)->child_cnt - 1) {
					err = tnc_next(c, zn, n);
					if (err) {
						/* Should be impossible */
						ubifs_assert(0);
						if (err == -ENOENT)
							err = -EINVAL;
						return err;
					}
					ubifs_assert(*n == 0);
					*n = -1;
				}
				return 0;
			}
			err = matches_name(c, &(*zn)->zbranch[*n], nm);
			if (err < 0)
				return err;
			if (err == NAME_LESS)
				return 0;
			if (err == NAME_MATCHES)
				return 1;
			ubifs_assert(err == NAME_GREATER);
		}
	} else {
		int nn = *n;
		struct ubifs_znode *znode = *zn;

		/* Look right */
		while (1) {
			err = tnc_next(c, &znode, &nn);
			if (err == -ENOENT)
				return 0;
			if (err < 0)
				return err;
			if (keys_cmp(c, &znode->zbranch[nn].key, key))
				return 0;
			err = matches_name(c, &znode->zbranch[nn], nm);
			if (err < 0)
				return err;
			if (err == NAME_GREATER)
				return 0;
			/* Still in the run: advance the out position */
			*zn = znode;
			*n = nn;
			if (err == NAME_MATCHES)
				return 1;
			ubifs_assert(err == NAME_LESS);
		}
	}
}
| 797 | |||
| 798 | /** | ||
| 799 | * fallible_matches_name - determine if a dent matches a given name. | ||
| 800 | * @c: UBIFS file-system description object | ||
| 801 | * @zbr: zbranch of dent | ||
| 802 | * @nm: name to match | ||
| 803 | * | ||
| 804 | * This is a "fallible" version of 'matches_name()' function which does not | ||
| 805 | * panic if the direntry/xentry referred by @zbr does not exist on the media. | ||
| 806 | * | ||
| 807 | * This function checks if xentry/direntry referred by zbranch @zbr matches name | ||
| 808 | * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr | ||
| 809 | * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA | ||
| 810 | * if xentry/direntry referred by @zbr does not exist on the media. A negative | ||
| 811 | * error code is returned in case of failure. | ||
| 812 | */ | ||
| 813 | static int fallible_matches_name(struct ubifs_info *c, | ||
| 814 | struct ubifs_zbranch *zbr, | ||
| 815 | const struct qstr *nm) | ||
| 816 | { | ||
| 817 | struct ubifs_dent_node *dent; | ||
| 818 | int nlen, err; | ||
| 819 | |||
| 820 | /* If possible, match against the dent in the leaf node cache */ | ||
| 821 | if (!zbr->leaf) { | ||
| 822 | dent = kmalloc(zbr->len, GFP_NOFS); | ||
| 823 | if (!dent) | ||
| 824 | return -ENOMEM; | ||
| 825 | |||
| 826 | err = fallible_read_node(c, &zbr->key, zbr, dent); | ||
| 827 | if (err < 0) | ||
| 828 | goto out_free; | ||
| 829 | if (err == 0) { | ||
| 830 | /* The node was not present */ | ||
| 831 | err = NOT_ON_MEDIA; | ||
| 832 | goto out_free; | ||
| 833 | } | ||
| 834 | ubifs_assert(err == 1); | ||
| 835 | |||
| 836 | err = lnc_add_directly(c, zbr, dent); | ||
| 837 | if (err) | ||
| 838 | goto out_free; | ||
| 839 | } else | ||
| 840 | dent = zbr->leaf; | ||
| 841 | |||
| 842 | nlen = le16_to_cpu(dent->nlen); | ||
| 843 | err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); | ||
| 844 | if (err == 0) { | ||
| 845 | if (nlen == nm->len) | ||
| 846 | return NAME_MATCHES; | ||
| 847 | else if (nlen < nm->len) | ||
| 848 | return NAME_LESS; | ||
| 849 | else | ||
| 850 | return NAME_GREATER; | ||
| 851 | } else if (err < 0) | ||
| 852 | return NAME_LESS; | ||
| 853 | else | ||
| 854 | return NAME_GREATER; | ||
| 855 | |||
| 856 | out_free: | ||
| 857 | kfree(dent); | ||
| 858 | return err; | ||
| 859 | } | ||
| 860 | |||
| 861 | /** | ||
| 862 | * fallible_resolve_collision - resolve a collision even if nodes are missing. | ||
| 863 | * @c: UBIFS file-system description object | ||
| 864 | * @key: key | ||
| 865 | * @zn: znode is returned here | ||
| 866 | * @n: branch number is passed and returned here | ||
| 867 | * @nm: name of directory entry | ||
| 868 | * @adding: indicates caller is adding a key to the TNC | ||
| 869 | * | ||
| 870 | * This is a "fallible" version of the 'resolve_collision()' function which | ||
| 871 | * does not panic if one of the nodes referred to by TNC does not exist on the | ||
| 872 | * media. This may happen when replaying the journal if a deleted node was | ||
| 873 | * Garbage-collected and the commit was not done. A branch that refers to a node | ||
| 874 | * that is not present is called a dangling branch. The following are the return | ||
| 875 | * codes for this function: | ||
| 876 | * o if @nm was found, %1 is returned and @zn and @n are set to the found | ||
| 877 | * branch; | ||
| 878 | * o if we are @adding and @nm was not found, %0 is returned; | ||
| 879 | * o if we are not @adding and @nm was not found, but a dangling branch was | ||
| 880 | * found, then %1 is returned and @zn and @n are set to the dangling branch; | ||
| 881 | * o a negative error code is returned in case of failure. | ||
| 882 | */ | ||
| 883 | static int fallible_resolve_collision(struct ubifs_info *c, | ||
| 884 | const union ubifs_key *key, | ||
| 885 | struct ubifs_znode **zn, int *n, | ||
| 886 | const struct qstr *nm, int adding) | ||
| 887 | { | ||
| 888 | struct ubifs_znode *o_znode = NULL, *znode = *zn; | ||
| 889 | int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; | ||
| 890 | |||
| 891 | cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); | ||
| 892 | if (unlikely(cmp < 0)) | ||
| 893 | return cmp; | ||
| 894 | if (cmp == NAME_MATCHES) | ||
| 895 | return 1; | ||
| 896 | if (cmp == NOT_ON_MEDIA) { | ||
| 897 | o_znode = znode; | ||
| 898 | o_n = nn; | ||
| 899 | /* | ||
| 900 | * We are unlucky and hit a dangling branch straight away. | ||
| 901 | * Now we do not really know where to go to find the needed | ||
| 902 | * branch - to the left or to the right. Well, let's try left. | ||
| 903 | */ | ||
| 904 | unsure = 1; | ||
| 905 | } else if (!adding) | ||
| 906 | unsure = 1; /* Remove a dangling branch wherever it is */ | ||
| 907 | |||
| 908 | if (cmp == NAME_GREATER || unsure) { | ||
| 909 | /* Look left */ | ||
| 910 | while (1) { | ||
| 911 | err = tnc_prev(c, zn, n); | ||
| 912 | if (err == -ENOENT) { | ||
| 913 | ubifs_assert(*n == 0); | ||
| 914 | *n = -1; | ||
| 915 | break; | ||
| 916 | } | ||
| 917 | if (err < 0) | ||
| 918 | return err; | ||
| 919 | if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { | ||
| 920 | /* See comments in 'resolve_collision()' */ | ||
| 921 | if (*n == (*zn)->child_cnt - 1) { | ||
| 922 | err = tnc_next(c, zn, n); | ||
| 923 | if (err) { | ||
| 924 | /* Should be impossible */ | ||
| 925 | ubifs_assert(0); | ||
| 926 | if (err == -ENOENT) | ||
| 927 | err = -EINVAL; | ||
| 928 | return err; | ||
| 929 | } | ||
| 930 | ubifs_assert(*n == 0); | ||
| 931 | *n = -1; | ||
| 932 | } | ||
| 933 | break; | ||
| 934 | } | ||
| 935 | err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); | ||
| 936 | if (err < 0) | ||
| 937 | return err; | ||
| 938 | if (err == NAME_MATCHES) | ||
| 939 | return 1; | ||
| 940 | if (err == NOT_ON_MEDIA) { | ||
| 941 | o_znode = *zn; | ||
| 942 | o_n = *n; | ||
| 943 | continue; | ||
| 944 | } | ||
| 945 | if (!adding) | ||
| 946 | continue; | ||
| 947 | if (err == NAME_LESS) | ||
| 948 | break; | ||
| 949 | else | ||
| 950 | unsure = 0; | ||
| 951 | } | ||
| 952 | } | ||
| 953 | |||
| 954 | if (cmp == NAME_LESS || unsure) { | ||
| 955 | /* Look right */ | ||
| 956 | *zn = znode; | ||
| 957 | *n = nn; | ||
| 958 | while (1) { | ||
| 959 | err = tnc_next(c, &znode, &nn); | ||
| 960 | if (err == -ENOENT) | ||
| 961 | break; | ||
| 962 | if (err < 0) | ||
| 963 | return err; | ||
| 964 | if (keys_cmp(c, &znode->zbranch[nn].key, key)) | ||
| 965 | break; | ||
| 966 | err = fallible_matches_name(c, &znode->zbranch[nn], nm); | ||
| 967 | if (err < 0) | ||
| 968 | return err; | ||
| 969 | if (err == NAME_GREATER) | ||
| 970 | break; | ||
| 971 | *zn = znode; | ||
| 972 | *n = nn; | ||
| 973 | if (err == NAME_MATCHES) | ||
| 974 | return 1; | ||
| 975 | if (err == NOT_ON_MEDIA) { | ||
| 976 | o_znode = znode; | ||
| 977 | o_n = nn; | ||
| 978 | } | ||
| 979 | } | ||
| 980 | } | ||
| 981 | |||
| 982 | /* Never match a dangling branch when adding */ | ||
| 983 | if (adding || !o_znode) | ||
| 984 | return 0; | ||
| 985 | |||
| 986 | dbg_mnt("dangling match LEB %d:%d len %d %s", | ||
| 987 | o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, | ||
| 988 | o_znode->zbranch[o_n].len, DBGKEY(key)); | ||
| 989 | *zn = o_znode; | ||
| 990 | *n = o_n; | ||
| 991 | return 1; | ||
| 992 | } | ||
| 993 | |||
| 994 | /** | ||
| 995 | * matches_position - determine if a zbranch matches a given position. | ||
| 996 | * @zbr: zbranch of dent | ||
| 997 | * @lnum: LEB number of dent to match | ||
| 998 | * @offs: offset of dent to match | ||
| 999 | * | ||
| 1000 | * This function returns %1 if @lnum:@offs matches, and %0 otherwise. | ||
| 1001 | */ | ||
| 1002 | static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) | ||
| 1003 | { | ||
| 1004 | if (zbr->lnum == lnum && zbr->offs == offs) | ||
| 1005 | return 1; | ||
| 1006 | else | ||
| 1007 | return 0; | ||
| 1008 | } | ||
| 1009 | |||
| 1010 | /** | ||
| 1011 | * resolve_collision_directly - resolve a collision directly. | ||
| 1012 | * @c: UBIFS file-system description object | ||
| 1013 | * @key: key of directory entry | ||
| 1014 | * @zn: znode is passed and returned here | ||
| 1015 | * @n: zbranch number is passed and returned here | ||
| 1016 | * @lnum: LEB number of dent node to match | ||
| 1017 | * @offs: offset of dent node to match | ||
| 1018 | * | ||
| 1019 | * This function is used for "hashed" keys to make sure the found directory or | ||
| 1020 | * extended attribute entry node is what was looked for. It is used when the | ||
| 1021 | * flash address of the right node is known (@lnum:@offs) which makes it much | ||
| 1022 | * easier to resolve collisions (no need to read entries and match full | ||
| 1023 | * names). This function returns %1 and sets @zn and @n if the collision is | ||
| 1024 | * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the | ||
| 1025 | * previous directory entry. Otherwise a negative error code is returned. | ||
| 1026 | */ | ||
| 1027 | static int resolve_collision_directly(struct ubifs_info *c, | ||
| 1028 | const union ubifs_key *key, | ||
| 1029 | struct ubifs_znode **zn, int *n, | ||
| 1030 | int lnum, int offs) | ||
| 1031 | { | ||
| 1032 | struct ubifs_znode *znode; | ||
| 1033 | int nn, err; | ||
| 1034 | |||
| 1035 | znode = *zn; | ||
| 1036 | nn = *n; | ||
| 1037 | if (matches_position(&znode->zbranch[nn], lnum, offs)) | ||
| 1038 | return 1; | ||
| 1039 | |||
| 1040 | /* Look left */ | ||
| 1041 | while (1) { | ||
| 1042 | err = tnc_prev(c, &znode, &nn); | ||
| 1043 | if (err == -ENOENT) | ||
| 1044 | break; | ||
| 1045 | if (err < 0) | ||
| 1046 | return err; | ||
| 1047 | if (keys_cmp(c, &znode->zbranch[nn].key, key)) | ||
| 1048 | break; | ||
| 1049 | if (matches_position(&znode->zbranch[nn], lnum, offs)) { | ||
| 1050 | *zn = znode; | ||
| 1051 | *n = nn; | ||
| 1052 | return 1; | ||
| 1053 | } | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | /* Look right */ | ||
| 1057 | znode = *zn; | ||
| 1058 | nn = *n; | ||
| 1059 | while (1) { | ||
| 1060 | err = tnc_next(c, &znode, &nn); | ||
| 1061 | if (err == -ENOENT) | ||
| 1062 | return 0; | ||
| 1063 | if (err < 0) | ||
| 1064 | return err; | ||
| 1065 | if (keys_cmp(c, &znode->zbranch[nn].key, key)) | ||
| 1066 | return 0; | ||
| 1067 | *zn = znode; | ||
| 1068 | *n = nn; | ||
| 1069 | if (matches_position(&znode->zbranch[nn], lnum, offs)) | ||
| 1070 | return 1; | ||
| 1071 | } | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | /** | ||
| 1075 | * dirty_cow_bottom_up - dirty a znode and its ancestors. | ||
| 1076 | * @c: UBIFS file-system description object | ||
| 1077 | * @znode: znode to dirty | ||
| 1078 | * | ||
| 1079 | * If we do not have a unique key that resides in a znode, then we cannot | ||
| 1080 | * dirty that znode from the top down (i.e. by using lookup_level0_dirty) | ||
| 1081 | * This function records the path back to the last dirty ancestor, and then | ||
| 1082 | * dirties the znodes on that path. | ||
| 1083 | */ | ||
| 1084 | static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, | ||
| 1085 | struct ubifs_znode *znode) | ||
| 1086 | { | ||
| 1087 | struct ubifs_znode *zp; | ||
| 1088 | int *path = c->bottom_up_buf, p = 0; | ||
| 1089 | |||
| 1090 | ubifs_assert(c->zroot.znode); | ||
| 1091 | ubifs_assert(znode); | ||
| 1092 | if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { | ||
| 1093 | kfree(c->bottom_up_buf); | ||
| 1094 | c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), | ||
| 1095 | GFP_NOFS); | ||
| 1096 | if (!c->bottom_up_buf) | ||
| 1097 | return ERR_PTR(-ENOMEM); | ||
| 1098 | path = c->bottom_up_buf; | ||
| 1099 | } | ||
| 1100 | if (c->zroot.znode->level) { | ||
| 1101 | /* Go up until parent is dirty */ | ||
| 1102 | while (1) { | ||
| 1103 | int n; | ||
| 1104 | |||
| 1105 | zp = znode->parent; | ||
| 1106 | if (!zp) | ||
| 1107 | break; | ||
| 1108 | n = znode->iip; | ||
| 1109 | ubifs_assert(p < c->zroot.znode->level); | ||
| 1110 | path[p++] = n; | ||
| 1111 | if (!zp->cnext && ubifs_zn_dirty(znode)) | ||
| 1112 | break; | ||
| 1113 | znode = zp; | ||
| 1114 | } | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | /* Come back down, dirtying as we go */ | ||
| 1118 | while (1) { | ||
| 1119 | struct ubifs_zbranch *zbr; | ||
| 1120 | |||
| 1121 | zp = znode->parent; | ||
| 1122 | if (zp) { | ||
| 1123 | ubifs_assert(path[p - 1] >= 0); | ||
| 1124 | ubifs_assert(path[p - 1] < zp->child_cnt); | ||
| 1125 | zbr = &zp->zbranch[path[--p]]; | ||
| 1126 | znode = dirty_cow_znode(c, zbr); | ||
| 1127 | } else { | ||
| 1128 | ubifs_assert(znode == c->zroot.znode); | ||
| 1129 | znode = dirty_cow_znode(c, &c->zroot); | ||
| 1130 | } | ||
| 1131 | if (unlikely(IS_ERR(znode)) || !p) | ||
| 1132 | break; | ||
| 1133 | ubifs_assert(path[p - 1] >= 0); | ||
| 1134 | ubifs_assert(path[p - 1] < znode->child_cnt); | ||
| 1135 | znode = znode->zbranch[path[p - 1]].znode; | ||
| 1136 | } | ||
| 1137 | |||
| 1138 | return znode; | ||
| 1139 | } | ||
| 1140 | |||
| 1141 | /** | ||
| 1142 | * ubifs_lookup_level0 - search for zero-level znode. | ||
| 1143 | * @c: UBIFS file-system description object | ||
| 1144 | * @key: key to lookup | ||
| 1145 | * @zn: znode is returned here | ||
| 1146 | * @n: znode branch slot number is returned here | ||
| 1147 | * | ||
| 1148 | * This function looks up the TNC tree and search for zero-level znode which | ||
| 1149 | * refers key @key. The found zero-level znode is returned in @zn. There are 3 | ||
| 1150 | * cases: | ||
| 1151 | * o exact match, i.e. the found zero-level znode contains key @key, then %1 | ||
| 1152 | * is returned and slot number of the matched branch is stored in @n; | ||
| 1153 | * o not exact match, which means that zero-level znode does not contain | ||
| 1154 | * @key, then %0 is returned and slot number of the closed branch is stored | ||
| 1155 | * in @n; | ||
| 1156 | * o @key is so small that it is even less than the lowest key of the | ||
| 1157 | * leftmost zero-level node, then %0 is returned and %0 is stored in @n. | ||
| 1158 | * | ||
| 1159 | * Note, when the TNC tree is traversed, some znodes may be absent, then this | ||
| 1160 | * function reads corresponding indexing nodes and inserts them to TNC. In | ||
| 1161 | * case of failure, a negative error code is returned. | ||
| 1162 | */ | ||
| 1163 | int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1164 | struct ubifs_znode **zn, int *n) | ||
| 1165 | { | ||
| 1166 | int err, exact; | ||
| 1167 | struct ubifs_znode *znode; | ||
| 1168 | unsigned long time = get_seconds(); | ||
| 1169 | |||
| 1170 | dbg_tnc("search key %s", DBGKEY(key)); | ||
| 1171 | |||
| 1172 | znode = c->zroot.znode; | ||
| 1173 | if (unlikely(!znode)) { | ||
| 1174 | znode = ubifs_load_znode(c, &c->zroot, NULL, 0); | ||
| 1175 | if (IS_ERR(znode)) | ||
| 1176 | return PTR_ERR(znode); | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | znode->time = time; | ||
| 1180 | |||
| 1181 | while (1) { | ||
| 1182 | struct ubifs_zbranch *zbr; | ||
| 1183 | |||
| 1184 | exact = ubifs_search_zbranch(c, znode, key, n); | ||
| 1185 | |||
| 1186 | if (znode->level == 0) | ||
| 1187 | break; | ||
| 1188 | |||
| 1189 | if (*n < 0) | ||
| 1190 | *n = 0; | ||
| 1191 | zbr = &znode->zbranch[*n]; | ||
| 1192 | |||
| 1193 | if (zbr->znode) { | ||
| 1194 | znode->time = time; | ||
| 1195 | znode = zbr->znode; | ||
| 1196 | continue; | ||
| 1197 | } | ||
| 1198 | |||
| 1199 | /* znode is not in TNC cache, load it from the media */ | ||
| 1200 | znode = ubifs_load_znode(c, zbr, znode, *n); | ||
| 1201 | if (IS_ERR(znode)) | ||
| 1202 | return PTR_ERR(znode); | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | *zn = znode; | ||
| 1206 | if (exact || !is_hash_key(c, key) || *n != -1) { | ||
| 1207 | dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); | ||
| 1208 | return exact; | ||
| 1209 | } | ||
| 1210 | |||
| 1211 | /* | ||
| 1212 | * Here is a tricky place. We have not found the key and this is a | ||
| 1213 | * "hashed" key, which may collide. The rest of the code deals with | ||
| 1214 | * situations like this: | ||
| 1215 | * | ||
| 1216 | * | 3 | 5 | | ||
| 1217 | * / \ | ||
| 1218 | * | 3 | 5 | | 6 | 7 | (x) | ||
| 1219 | * | ||
| 1220 | * Or more a complex example: | ||
| 1221 | * | ||
| 1222 | * | 1 | 5 | | ||
| 1223 | * / \ | ||
| 1224 | * | 1 | 3 | | 5 | 8 | | ||
| 1225 | * \ / | ||
| 1226 | * | 5 | 5 | | 6 | 7 | (x) | ||
| 1227 | * | ||
| 1228 | * In the examples, if we are looking for key "5", we may reach nodes | ||
| 1229 | * marked with "(x)". In this case what we have do is to look at the | ||
| 1230 | * left and see if there is "5" key there. If there is, we have to | ||
| 1231 | * return it. | ||
| 1232 | * | ||
| 1233 | * Note, this whole situation is possible because we allow to have | ||
| 1234 | * elements which are equivalent to the next key in the parent in the | ||
| 1235 | * children of current znode. For example, this happens if we split a | ||
| 1236 | * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something | ||
| 1237 | * like this: | ||
| 1238 | * | 3 | 5 | | ||
| 1239 | * / \ | ||
| 1240 | * | 3 | 5 | | 5 | 6 | 7 | | ||
| 1241 | * ^ | ||
| 1242 | * And this becomes what is at the first "picture" after key "5" marked | ||
| 1243 | * with "^" is removed. What could be done is we could prohibit | ||
| 1244 | * splitting in the middle of the colliding sequence. Also, when | ||
| 1245 | * removing the leftmost key, we would have to correct the key of the | ||
| 1246 | * parent node, which would introduce additional complications. Namely, | ||
| 1247 | * if we changed the the leftmost key of the parent znode, the garbage | ||
| 1248 | * collector would be unable to find it (GC is doing this when GC'ing | ||
| 1249 | * indexing LEBs). Although we already have an additional RB-tree where | ||
| 1250 | * we save such changed znodes (see 'ins_clr_old_idx_znode()') until | ||
| 1251 | * after the commit. But anyway, this does not look easy to implement | ||
| 1252 | * so we did not try this. | ||
| 1253 | */ | ||
| 1254 | err = tnc_prev(c, &znode, n); | ||
| 1255 | if (err == -ENOENT) { | ||
| 1256 | dbg_tnc("found 0, lvl %d, n -1", znode->level); | ||
| 1257 | *n = -1; | ||
| 1258 | return 0; | ||
| 1259 | } | ||
| 1260 | if (unlikely(err < 0)) | ||
| 1261 | return err; | ||
| 1262 | if (keys_cmp(c, key, &znode->zbranch[*n].key)) { | ||
| 1263 | dbg_tnc("found 0, lvl %d, n -1", znode->level); | ||
| 1264 | *n = -1; | ||
| 1265 | return 0; | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); | ||
| 1269 | *zn = znode; | ||
| 1270 | return 1; | ||
| 1271 | } | ||
| 1272 | |||
| 1273 | /** | ||
| 1274 | * lookup_level0_dirty - search for zero-level znode dirtying. | ||
| 1275 | * @c: UBIFS file-system description object | ||
| 1276 | * @key: key to lookup | ||
| 1277 | * @zn: znode is returned here | ||
| 1278 | * @n: znode branch slot number is returned here | ||
| 1279 | * | ||
| 1280 | * This function looks up the TNC tree and search for zero-level znode which | ||
| 1281 | * refers key @key. The found zero-level znode is returned in @zn. There are 3 | ||
| 1282 | * cases: | ||
| 1283 | * o exact match, i.e. the found zero-level znode contains key @key, then %1 | ||
| 1284 | * is returned and slot number of the matched branch is stored in @n; | ||
| 1285 | * o not exact match, which means that zero-level znode does not contain @key | ||
| 1286 | * then %0 is returned and slot number of the closed branch is stored in | ||
| 1287 | * @n; | ||
| 1288 | * o @key is so small that it is even less than the lowest key of the | ||
| 1289 | * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. | ||
| 1290 | * | ||
| 1291 | * Additionally all znodes in the path from the root to the located zero-level | ||
| 1292 | * znode are marked as dirty. | ||
| 1293 | * | ||
| 1294 | * Note, when the TNC tree is traversed, some znodes may be absent, then this | ||
| 1295 | * function reads corresponding indexing nodes and inserts them to TNC. In | ||
| 1296 | * case of failure, a negative error code is returned. | ||
| 1297 | */ | ||
| 1298 | static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1299 | struct ubifs_znode **zn, int *n) | ||
| 1300 | { | ||
| 1301 | int err, exact; | ||
| 1302 | struct ubifs_znode *znode; | ||
| 1303 | unsigned long time = get_seconds(); | ||
| 1304 | |||
| 1305 | dbg_tnc("search and dirty key %s", DBGKEY(key)); | ||
| 1306 | |||
| 1307 | znode = c->zroot.znode; | ||
| 1308 | if (unlikely(!znode)) { | ||
| 1309 | znode = ubifs_load_znode(c, &c->zroot, NULL, 0); | ||
| 1310 | if (IS_ERR(znode)) | ||
| 1311 | return PTR_ERR(znode); | ||
| 1312 | } | ||
| 1313 | |||
| 1314 | znode = dirty_cow_znode(c, &c->zroot); | ||
| 1315 | if (IS_ERR(znode)) | ||
| 1316 | return PTR_ERR(znode); | ||
| 1317 | |||
| 1318 | znode->time = time; | ||
| 1319 | |||
| 1320 | while (1) { | ||
| 1321 | struct ubifs_zbranch *zbr; | ||
| 1322 | |||
| 1323 | exact = ubifs_search_zbranch(c, znode, key, n); | ||
| 1324 | |||
| 1325 | if (znode->level == 0) | ||
| 1326 | break; | ||
| 1327 | |||
| 1328 | if (*n < 0) | ||
| 1329 | *n = 0; | ||
| 1330 | zbr = &znode->zbranch[*n]; | ||
| 1331 | |||
| 1332 | if (zbr->znode) { | ||
| 1333 | znode->time = time; | ||
| 1334 | znode = dirty_cow_znode(c, zbr); | ||
| 1335 | if (IS_ERR(znode)) | ||
| 1336 | return PTR_ERR(znode); | ||
| 1337 | continue; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | /* znode is not in TNC cache, load it from the media */ | ||
| 1341 | znode = ubifs_load_znode(c, zbr, znode, *n); | ||
| 1342 | if (IS_ERR(znode)) | ||
| 1343 | return PTR_ERR(znode); | ||
| 1344 | znode = dirty_cow_znode(c, zbr); | ||
| 1345 | if (IS_ERR(znode)) | ||
| 1346 | return PTR_ERR(znode); | ||
| 1347 | } | ||
| 1348 | |||
| 1349 | *zn = znode; | ||
| 1350 | if (exact || !is_hash_key(c, key) || *n != -1) { | ||
| 1351 | dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); | ||
| 1352 | return exact; | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | /* | ||
| 1356 | * See huge comment at 'lookup_level0_dirty()' what is the rest of the | ||
| 1357 | * code. | ||
| 1358 | */ | ||
| 1359 | err = tnc_prev(c, &znode, n); | ||
| 1360 | if (err == -ENOENT) { | ||
| 1361 | *n = -1; | ||
| 1362 | dbg_tnc("found 0, lvl %d, n -1", znode->level); | ||
| 1363 | return 0; | ||
| 1364 | } | ||
| 1365 | if (unlikely(err < 0)) | ||
| 1366 | return err; | ||
| 1367 | if (keys_cmp(c, key, &znode->zbranch[*n].key)) { | ||
| 1368 | *n = -1; | ||
| 1369 | dbg_tnc("found 0, lvl %d, n -1", znode->level); | ||
| 1370 | return 0; | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | if (znode->cnext || !ubifs_zn_dirty(znode)) { | ||
| 1374 | znode = dirty_cow_bottom_up(c, znode); | ||
| 1375 | if (IS_ERR(znode)) | ||
| 1376 | return PTR_ERR(znode); | ||
| 1377 | } | ||
| 1378 | |||
| 1379 | dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); | ||
| 1380 | *zn = znode; | ||
| 1381 | return 1; | ||
| 1382 | } | ||
| 1383 | |||
| 1384 | /** | ||
| 1385 | * ubifs_tnc_lookup - look up a file-system node. | ||
| 1386 | * @c: UBIFS file-system description object | ||
| 1387 | * @key: node key to lookup | ||
| 1388 | * @node: the node is returned here | ||
| 1389 | * | ||
| 1390 | * This function look up and reads node with key @key. The caller has to make | ||
| 1391 | * sure the @node buffer is large enough to fit the node. Returns zero in case | ||
| 1392 | * of success, %-ENOENT if the node was not found, and a negative error code in | ||
| 1393 | * case of failure. | ||
| 1394 | */ | ||
| 1395 | int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1396 | void *node) | ||
| 1397 | { | ||
| 1398 | int found, n, err; | ||
| 1399 | struct ubifs_znode *znode; | ||
| 1400 | struct ubifs_zbranch zbr, *zt; | ||
| 1401 | |||
| 1402 | mutex_lock(&c->tnc_mutex); | ||
| 1403 | found = ubifs_lookup_level0(c, key, &znode, &n); | ||
| 1404 | if (!found) { | ||
| 1405 | err = -ENOENT; | ||
| 1406 | goto out; | ||
| 1407 | } else if (found < 0) { | ||
| 1408 | err = found; | ||
| 1409 | goto out; | ||
| 1410 | } | ||
| 1411 | zt = &znode->zbranch[n]; | ||
| 1412 | if (is_hash_key(c, key)) { | ||
| 1413 | /* | ||
| 1414 | * In this case the leaf node cache gets used, so we pass the | ||
| 1415 | * address of the zbranch and keep the mutex locked | ||
| 1416 | */ | ||
| 1417 | err = tnc_read_node_nm(c, zt, node); | ||
| 1418 | goto out; | ||
| 1419 | } | ||
| 1420 | zbr = znode->zbranch[n]; | ||
| 1421 | mutex_unlock(&c->tnc_mutex); | ||
| 1422 | |||
| 1423 | err = ubifs_tnc_read_node(c, &zbr, node); | ||
| 1424 | return err; | ||
| 1425 | |||
| 1426 | out: | ||
| 1427 | mutex_unlock(&c->tnc_mutex); | ||
| 1428 | return err; | ||
| 1429 | } | ||
| 1430 | |||
| 1431 | /** | ||
| 1432 | * ubifs_tnc_locate - look up a file-system node and return it and its location. | ||
| 1433 | * @c: UBIFS file-system description object | ||
| 1434 | * @key: node key to lookup | ||
| 1435 | * @node: the node is returned here | ||
| 1436 | * @lnum: LEB number is returned here | ||
| 1437 | * @offs: offset is returned here | ||
| 1438 | * | ||
| 1439 | * This function is the same as 'ubifs_tnc_lookup()' but it returns the node | ||
| 1440 | * location also. See 'ubifs_tnc_lookup()'. | ||
| 1441 | */ | ||
| 1442 | int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1443 | void *node, int *lnum, int *offs) | ||
| 1444 | { | ||
| 1445 | int found, n, err; | ||
| 1446 | struct ubifs_znode *znode; | ||
| 1447 | struct ubifs_zbranch zbr, *zt; | ||
| 1448 | |||
| 1449 | mutex_lock(&c->tnc_mutex); | ||
| 1450 | found = ubifs_lookup_level0(c, key, &znode, &n); | ||
| 1451 | if (!found) { | ||
| 1452 | err = -ENOENT; | ||
| 1453 | goto out; | ||
| 1454 | } else if (found < 0) { | ||
| 1455 | err = found; | ||
| 1456 | goto out; | ||
| 1457 | } | ||
| 1458 | zt = &znode->zbranch[n]; | ||
| 1459 | if (is_hash_key(c, key)) { | ||
| 1460 | /* | ||
| 1461 | * In this case the leaf node cache gets used, so we pass the | ||
| 1462 | * address of the zbranch and keep the mutex locked | ||
| 1463 | */ | ||
| 1464 | *lnum = zt->lnum; | ||
| 1465 | *offs = zt->offs; | ||
| 1466 | err = tnc_read_node_nm(c, zt, node); | ||
| 1467 | goto out; | ||
| 1468 | } | ||
| 1469 | zbr = znode->zbranch[n]; | ||
| 1470 | mutex_unlock(&c->tnc_mutex); | ||
| 1471 | |||
| 1472 | *lnum = zbr.lnum; | ||
| 1473 | *offs = zbr.offs; | ||
| 1474 | |||
| 1475 | err = ubifs_tnc_read_node(c, &zbr, node); | ||
| 1476 | return err; | ||
| 1477 | |||
| 1478 | out: | ||
| 1479 | mutex_unlock(&c->tnc_mutex); | ||
| 1480 | return err; | ||
| 1481 | } | ||
| 1482 | |||
| 1483 | /** | ||
| 1484 | * do_lookup_nm- look up a "hashed" node. | ||
| 1485 | * @c: UBIFS file-system description object | ||
| 1486 | * @key: node key to lookup | ||
| 1487 | * @node: the node is returned here | ||
| 1488 | * @nm: node name | ||
| 1489 | * | ||
| 1490 | * This function look up and reads a node which contains name hash in the key. | ||
| 1491 | * Since the hash may have collisions, there may be many nodes with the same | ||
| 1492 | * key, so we have to sequentially look to all of them until the needed one is | ||
| 1493 | * found. This function returns zero in case of success, %-ENOENT if the node | ||
| 1494 | * was not found, and a negative error code in case of failure. | ||
| 1495 | */ | ||
| 1496 | static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1497 | void *node, const struct qstr *nm) | ||
| 1498 | { | ||
| 1499 | int found, n, err; | ||
| 1500 | struct ubifs_znode *znode; | ||
| 1501 | struct ubifs_zbranch zbr; | ||
| 1502 | |||
| 1503 | dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); | ||
| 1504 | mutex_lock(&c->tnc_mutex); | ||
| 1505 | found = ubifs_lookup_level0(c, key, &znode, &n); | ||
| 1506 | if (!found) { | ||
| 1507 | err = -ENOENT; | ||
| 1508 | goto out_unlock; | ||
| 1509 | } else if (found < 0) { | ||
| 1510 | err = found; | ||
| 1511 | goto out_unlock; | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | ubifs_assert(n >= 0); | ||
| 1515 | |||
| 1516 | err = resolve_collision(c, key, &znode, &n, nm); | ||
| 1517 | dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); | ||
| 1518 | if (unlikely(err < 0)) | ||
| 1519 | goto out_unlock; | ||
| 1520 | if (err == 0) { | ||
| 1521 | err = -ENOENT; | ||
| 1522 | goto out_unlock; | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | zbr = znode->zbranch[n]; | ||
| 1526 | mutex_unlock(&c->tnc_mutex); | ||
| 1527 | |||
| 1528 | err = tnc_read_node_nm(c, &zbr, node); | ||
| 1529 | return err; | ||
| 1530 | |||
| 1531 | out_unlock: | ||
| 1532 | mutex_unlock(&c->tnc_mutex); | ||
| 1533 | return err; | ||
| 1534 | } | ||
| 1535 | |||
| 1536 | /** | ||
| 1537 | * ubifs_tnc_lookup_nm - look up a "hashed" node. | ||
| 1538 | * @c: UBIFS file-system description object | ||
| 1539 | * @key: node key to lookup | ||
| 1540 | * @node: the node is returned here | ||
| 1541 | * @nm: node name | ||
| 1542 | * | ||
| 1543 | * This function look up and reads a node which contains name hash in the key. | ||
| 1544 | * Since the hash may have collisions, there may be many nodes with the same | ||
| 1545 | * key, so we have to sequentially look to all of them until the needed one is | ||
| 1546 | * found. This function returns zero in case of success, %-ENOENT if the node | ||
| 1547 | * was not found, and a negative error code in case of failure. | ||
| 1548 | */ | ||
| 1549 | int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1550 | void *node, const struct qstr *nm) | ||
| 1551 | { | ||
| 1552 | int err, len; | ||
| 1553 | const struct ubifs_dent_node *dent = node; | ||
| 1554 | |||
| 1555 | /* | ||
| 1556 | * We assume that in most of the cases there are no name collisions and | ||
| 1557 | * 'ubifs_tnc_lookup()' returns us the right direntry. | ||
| 1558 | */ | ||
| 1559 | err = ubifs_tnc_lookup(c, key, node); | ||
| 1560 | if (err) | ||
| 1561 | return err; | ||
| 1562 | |||
| 1563 | len = le16_to_cpu(dent->nlen); | ||
| 1564 | if (nm->len == len && !memcmp(dent->name, nm->name, len)) | ||
| 1565 | return 0; | ||
| 1566 | |||
| 1567 | /* | ||
| 1568 | * Unluckily, there are hash collisions and we have to iterate over | ||
| 1569 | * them look at each direntry with colliding name hash sequentially. | ||
| 1570 | */ | ||
| 1571 | return do_lookup_nm(c, key, node, nm); | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | /** | ||
| 1575 | * correct_parent_keys - correct parent znodes' keys. | ||
| 1576 | * @c: UBIFS file-system description object | ||
| 1577 | * @znode: znode to correct parent znodes for | ||
| 1578 | * | ||
| 1579 | * This is a helper function for 'tnc_insert()'. When the key of the leftmost | ||
| 1580 | * zbranch changes, keys of parent znodes have to be corrected. This helper | ||
| 1581 | * function is called in such situations and corrects the keys if needed. | ||
| 1582 | */ | ||
| 1583 | static void correct_parent_keys(const struct ubifs_info *c, | ||
| 1584 | struct ubifs_znode *znode) | ||
| 1585 | { | ||
| 1586 | union ubifs_key *key, *key1; | ||
| 1587 | |||
| 1588 | ubifs_assert(znode->parent); | ||
| 1589 | ubifs_assert(znode->iip == 0); | ||
| 1590 | |||
| 1591 | key = &znode->zbranch[0].key; | ||
| 1592 | key1 = &znode->parent->zbranch[0].key; | ||
| 1593 | |||
| 1594 | while (keys_cmp(c, key, key1) < 0) { | ||
| 1595 | key_copy(c, key, key1); | ||
| 1596 | znode = znode->parent; | ||
| 1597 | znode->alt = 1; | ||
| 1598 | if (!znode->parent || znode->iip) | ||
| 1599 | break; | ||
| 1600 | key1 = &znode->parent->zbranch[0].key; | ||
| 1601 | } | ||
| 1602 | } | ||
| 1603 | |||
| 1604 | /** | ||
| 1605 | * insert_zbranch - insert a zbranch into a znode. | ||
| 1606 | * @znode: znode into which to insert | ||
| 1607 | * @zbr: zbranch to insert | ||
| 1608 | * @n: slot number to insert to | ||
| 1609 | * | ||
| 1610 | * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in | ||
| 1611 | * znode's array of zbranches and keeps zbranches consolidated, so when a new | ||
| 1612 | * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th | ||
| 1613 | * slot, zbranches starting from @n have to be moved right. | ||
| 1614 | */ | ||
| 1615 | static void insert_zbranch(struct ubifs_znode *znode, | ||
| 1616 | const struct ubifs_zbranch *zbr, int n) | ||
| 1617 | { | ||
| 1618 | int i; | ||
| 1619 | |||
| 1620 | ubifs_assert(ubifs_zn_dirty(znode)); | ||
| 1621 | |||
| 1622 | if (znode->level) { | ||
| 1623 | for (i = znode->child_cnt; i > n; i--) { | ||
| 1624 | znode->zbranch[i] = znode->zbranch[i - 1]; | ||
| 1625 | if (znode->zbranch[i].znode) | ||
| 1626 | znode->zbranch[i].znode->iip = i; | ||
| 1627 | } | ||
| 1628 | if (zbr->znode) | ||
| 1629 | zbr->znode->iip = n; | ||
| 1630 | } else | ||
| 1631 | for (i = znode->child_cnt; i > n; i--) | ||
| 1632 | znode->zbranch[i] = znode->zbranch[i - 1]; | ||
| 1633 | |||
| 1634 | znode->zbranch[n] = *zbr; | ||
| 1635 | znode->child_cnt += 1; | ||
| 1636 | |||
| 1637 | /* | ||
| 1638 | * After inserting at slot zero, the lower bound of the key range of | ||
| 1639 | * this znode may have changed. If this znode is subsequently split | ||
| 1640 | * then the upper bound of the key range may change, and furthermore | ||
| 1641 | * it could change to be lower than the original lower bound. If that | ||
| 1642 | * happens, then it will no longer be possible to find this znode in the | ||
| 1643 | * TNC using the key from the index node on flash. That is bad because | ||
| 1644 | * if it is not found, we will assume it is obsolete and may overwrite | ||
| 1645 | * it. Then if there is an unclean unmount, we will start using the | ||
| 1646 | * old index which will be broken. | ||
| 1647 | * | ||
| 1648 | * So we first mark znodes that have insertions at slot zero, and then | ||
| 1649 | * if they are split we add their lnum/offs to the old_idx tree. | ||
| 1650 | */ | ||
| 1651 | if (n == 0) | ||
| 1652 | znode->alt = 1; | ||
| 1653 | } | ||
| 1654 | |||
| 1655 | /** | ||
| 1656 | * tnc_insert - insert a node into TNC. | ||
| 1657 | * @c: UBIFS file-system description object | ||
| 1658 | * @znode: znode to insert into | ||
| 1659 | * @zbr: branch to insert | ||
| 1660 | * @n: slot number to insert new zbranch to | ||
| 1661 | * | ||
| 1662 | * This function inserts a new node described by @zbr into znode @znode. If | ||
| 1663 | * znode does not have a free slot for new zbranch, it is split. Parent znodes | ||
| 1664 | * are splat as well if needed. Returns zero in case of success or a negative | ||
| 1665 | * error code in case of failure. | ||
| 1666 | */ | ||
| 1667 | static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, | ||
| 1668 | struct ubifs_zbranch *zbr, int n) | ||
| 1669 | { | ||
| 1670 | struct ubifs_znode *zn, *zi, *zp; | ||
| 1671 | int i, keep, move, appending = 0; | ||
| 1672 | union ubifs_key *key = &zbr->key; | ||
| 1673 | |||
| 1674 | ubifs_assert(n >= 0 && n <= c->fanout); | ||
| 1675 | |||
| 1676 | /* Implement naive insert for now */ | ||
| 1677 | again: | ||
| 1678 | zp = znode->parent; | ||
| 1679 | if (znode->child_cnt < c->fanout) { | ||
| 1680 | ubifs_assert(n != c->fanout); | ||
| 1681 | dbg_tnc("inserted at %d level %d, key %s", n, znode->level, | ||
| 1682 | DBGKEY(key)); | ||
| 1683 | |||
| 1684 | insert_zbranch(znode, zbr, n); | ||
| 1685 | |||
| 1686 | /* Ensure parent's key is correct */ | ||
| 1687 | if (n == 0 && zp && znode->iip == 0) | ||
| 1688 | correct_parent_keys(c, znode); | ||
| 1689 | |||
| 1690 | return 0; | ||
| 1691 | } | ||
| 1692 | |||
| 1693 | /* | ||
| 1694 | * Unfortunately, @znode does not have more empty slots and we have to | ||
| 1695 | * split it. | ||
| 1696 | */ | ||
| 1697 | dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); | ||
| 1698 | |||
| 1699 | if (znode->alt) | ||
| 1700 | /* | ||
| 1701 | * We can no longer be sure of finding this znode by key, so we | ||
| 1702 | * record it in the old_idx tree. | ||
| 1703 | */ | ||
| 1704 | ins_clr_old_idx_znode(c, znode); | ||
| 1705 | |||
| 1706 | zn = kzalloc(c->max_znode_sz, GFP_NOFS); | ||
| 1707 | if (!zn) | ||
| 1708 | return -ENOMEM; | ||
| 1709 | zn->parent = zp; | ||
| 1710 | zn->level = znode->level; | ||
| 1711 | |||
| 1712 | /* Decide where to split */ | ||
| 1713 | if (znode->level == 0 && n == c->fanout && | ||
| 1714 | key_type(c, key) == UBIFS_DATA_KEY) { | ||
| 1715 | union ubifs_key *key1; | ||
| 1716 | |||
| 1717 | /* | ||
| 1718 | * If this is an inode which is being appended - do not split | ||
| 1719 | * it because no other zbranches can be inserted between | ||
| 1720 | * zbranches of consecutive data nodes anyway. | ||
| 1721 | */ | ||
| 1722 | key1 = &znode->zbranch[n - 1].key; | ||
| 1723 | if (key_inum(c, key1) == key_inum(c, key) && | ||
| 1724 | key_type(c, key1) == UBIFS_DATA_KEY && | ||
| 1725 | key_block(c, key1) == key_block(c, key) - 1) | ||
| 1726 | appending = 1; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | if (appending) { | ||
| 1730 | keep = c->fanout; | ||
| 1731 | move = 0; | ||
| 1732 | } else { | ||
| 1733 | keep = (c->fanout + 1) / 2; | ||
| 1734 | move = c->fanout - keep; | ||
| 1735 | } | ||
| 1736 | |||
| 1737 | /* | ||
| 1738 | * Although we don't at present, we could look at the neighbors and see | ||
| 1739 | * if we can move some zbranches there. | ||
| 1740 | */ | ||
| 1741 | |||
| 1742 | if (n < keep) { | ||
| 1743 | /* Insert into existing znode */ | ||
| 1744 | zi = znode; | ||
| 1745 | move += 1; | ||
| 1746 | keep -= 1; | ||
| 1747 | } else { | ||
| 1748 | /* Insert into new znode */ | ||
| 1749 | zi = zn; | ||
| 1750 | n -= keep; | ||
| 1751 | /* Re-parent */ | ||
| 1752 | if (zn->level != 0) | ||
| 1753 | zbr->znode->parent = zn; | ||
| 1754 | } | ||
| 1755 | |||
| 1756 | __set_bit(DIRTY_ZNODE, &zn->flags); | ||
| 1757 | atomic_long_inc(&c->dirty_zn_cnt); | ||
| 1758 | |||
| 1759 | zn->child_cnt = move; | ||
| 1760 | znode->child_cnt = keep; | ||
| 1761 | |||
| 1762 | dbg_tnc("moving %d, keeping %d", move, keep); | ||
| 1763 | |||
| 1764 | /* Move zbranch */ | ||
| 1765 | for (i = 0; i < move; i++) { | ||
| 1766 | zn->zbranch[i] = znode->zbranch[keep + i]; | ||
| 1767 | /* Re-parent */ | ||
| 1768 | if (zn->level != 0) | ||
| 1769 | if (zn->zbranch[i].znode) { | ||
| 1770 | zn->zbranch[i].znode->parent = zn; | ||
| 1771 | zn->zbranch[i].znode->iip = i; | ||
| 1772 | } | ||
| 1773 | } | ||
| 1774 | |||
| 1775 | /* Insert new key and branch */ | ||
| 1776 | dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); | ||
| 1777 | |||
| 1778 | insert_zbranch(zi, zbr, n); | ||
| 1779 | |||
| 1780 | /* Insert new znode (produced by spitting) into the parent */ | ||
| 1781 | if (zp) { | ||
| 1782 | i = n; | ||
| 1783 | /* Locate insertion point */ | ||
| 1784 | n = znode->iip + 1; | ||
| 1785 | if (appending && n != c->fanout) | ||
| 1786 | appending = 0; | ||
| 1787 | |||
| 1788 | if (i == 0 && zi == znode && znode->iip == 0) | ||
| 1789 | correct_parent_keys(c, znode); | ||
| 1790 | |||
| 1791 | /* Tail recursion */ | ||
| 1792 | zbr->key = zn->zbranch[0].key; | ||
| 1793 | zbr->znode = zn; | ||
| 1794 | zbr->lnum = 0; | ||
| 1795 | zbr->offs = 0; | ||
| 1796 | zbr->len = 0; | ||
| 1797 | znode = zp; | ||
| 1798 | |||
| 1799 | goto again; | ||
| 1800 | } | ||
| 1801 | |||
| 1802 | /* We have to split root znode */ | ||
| 1803 | dbg_tnc("creating new zroot at level %d", znode->level + 1); | ||
| 1804 | |||
| 1805 | zi = kzalloc(c->max_znode_sz, GFP_NOFS); | ||
| 1806 | if (!zi) | ||
| 1807 | return -ENOMEM; | ||
| 1808 | |||
| 1809 | zi->child_cnt = 2; | ||
| 1810 | zi->level = znode->level + 1; | ||
| 1811 | |||
| 1812 | __set_bit(DIRTY_ZNODE, &zi->flags); | ||
| 1813 | atomic_long_inc(&c->dirty_zn_cnt); | ||
| 1814 | |||
| 1815 | zi->zbranch[0].key = znode->zbranch[0].key; | ||
| 1816 | zi->zbranch[0].znode = znode; | ||
| 1817 | zi->zbranch[0].lnum = c->zroot.lnum; | ||
| 1818 | zi->zbranch[0].offs = c->zroot.offs; | ||
| 1819 | zi->zbranch[0].len = c->zroot.len; | ||
| 1820 | zi->zbranch[1].key = zn->zbranch[0].key; | ||
| 1821 | zi->zbranch[1].znode = zn; | ||
| 1822 | |||
| 1823 | c->zroot.lnum = 0; | ||
| 1824 | c->zroot.offs = 0; | ||
| 1825 | c->zroot.len = 0; | ||
| 1826 | c->zroot.znode = zi; | ||
| 1827 | |||
| 1828 | zn->parent = zi; | ||
| 1829 | zn->iip = 1; | ||
| 1830 | znode->parent = zi; | ||
| 1831 | znode->iip = 0; | ||
| 1832 | |||
| 1833 | return 0; | ||
| 1834 | } | ||
| 1835 | |||
| 1836 | /** | ||
| 1837 | * ubifs_tnc_add - add a node to TNC. | ||
| 1838 | * @c: UBIFS file-system description object | ||
| 1839 | * @key: key to add | ||
| 1840 | * @lnum: LEB number of node | ||
| 1841 | * @offs: node offset | ||
| 1842 | * @len: node length | ||
| 1843 | * | ||
| 1844 | * This function adds a node with key @key to TNC. The node may be new or it may | ||
| 1845 | * obsolete some existing one. Returns %0 on success or negative error code on | ||
| 1846 | * failure. | ||
| 1847 | */ | ||
| 1848 | int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, | ||
| 1849 | int offs, int len) | ||
| 1850 | { | ||
| 1851 | int found, n, err = 0; | ||
| 1852 | struct ubifs_znode *znode; | ||
| 1853 | |||
| 1854 | mutex_lock(&c->tnc_mutex); | ||
| 1855 | dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); | ||
| 1856 | found = lookup_level0_dirty(c, key, &znode, &n); | ||
| 1857 | if (!found) { | ||
| 1858 | struct ubifs_zbranch zbr; | ||
| 1859 | |||
| 1860 | zbr.znode = NULL; | ||
| 1861 | zbr.lnum = lnum; | ||
| 1862 | zbr.offs = offs; | ||
| 1863 | zbr.len = len; | ||
| 1864 | key_copy(c, key, &zbr.key); | ||
| 1865 | err = tnc_insert(c, znode, &zbr, n + 1); | ||
| 1866 | } else if (found == 1) { | ||
| 1867 | struct ubifs_zbranch *zbr = &znode->zbranch[n]; | ||
| 1868 | |||
| 1869 | lnc_free(zbr); | ||
| 1870 | err = ubifs_add_dirt(c, zbr->lnum, zbr->len); | ||
| 1871 | zbr->lnum = lnum; | ||
| 1872 | zbr->offs = offs; | ||
| 1873 | zbr->len = len; | ||
| 1874 | } else | ||
| 1875 | err = found; | ||
| 1876 | if (!err) | ||
| 1877 | err = dbg_check_tnc(c, 0); | ||
| 1878 | mutex_unlock(&c->tnc_mutex); | ||
| 1879 | |||
| 1880 | return err; | ||
| 1881 | } | ||
| 1882 | |||
| 1883 | /** | ||
| 1884 | * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. | ||
| 1885 | * @c: UBIFS file-system description object | ||
| 1886 | * @key: key to add | ||
| 1887 | * @old_lnum: LEB number of old node | ||
| 1888 | * @old_offs: old node offset | ||
| 1889 | * @lnum: LEB number of node | ||
| 1890 | * @offs: node offset | ||
| 1891 | * @len: node length | ||
| 1892 | * | ||
| 1893 | * This function replaces a node with key @key in the TNC only if the old node | ||
| 1894 | * is found. This function is called by garbage collection when node are moved. | ||
| 1895 | * Returns %0 on success or negative error code on failure. | ||
| 1896 | */ | ||
| 1897 | int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1898 | int old_lnum, int old_offs, int lnum, int offs, int len) | ||
| 1899 | { | ||
| 1900 | int found, n, err = 0; | ||
| 1901 | struct ubifs_znode *znode; | ||
| 1902 | |||
| 1903 | mutex_lock(&c->tnc_mutex); | ||
| 1904 | dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, | ||
| 1905 | old_offs, lnum, offs, len, DBGKEY(key)); | ||
| 1906 | found = lookup_level0_dirty(c, key, &znode, &n); | ||
| 1907 | if (found < 0) { | ||
| 1908 | err = found; | ||
| 1909 | goto out_unlock; | ||
| 1910 | } | ||
| 1911 | |||
| 1912 | if (found == 1) { | ||
| 1913 | struct ubifs_zbranch *zbr = &znode->zbranch[n]; | ||
| 1914 | |||
| 1915 | found = 0; | ||
| 1916 | if (zbr->lnum == old_lnum && zbr->offs == old_offs) { | ||
| 1917 | lnc_free(zbr); | ||
| 1918 | err = ubifs_add_dirt(c, zbr->lnum, zbr->len); | ||
| 1919 | if (err) | ||
| 1920 | goto out_unlock; | ||
| 1921 | zbr->lnum = lnum; | ||
| 1922 | zbr->offs = offs; | ||
| 1923 | zbr->len = len; | ||
| 1924 | found = 1; | ||
| 1925 | } else if (is_hash_key(c, key)) { | ||
| 1926 | found = resolve_collision_directly(c, key, &znode, &n, | ||
| 1927 | old_lnum, old_offs); | ||
| 1928 | dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", | ||
| 1929 | found, znode, n, old_lnum, old_offs); | ||
| 1930 | if (found < 0) { | ||
| 1931 | err = found; | ||
| 1932 | goto out_unlock; | ||
| 1933 | } | ||
| 1934 | |||
| 1935 | if (found) { | ||
| 1936 | /* Ensure the znode is dirtied */ | ||
| 1937 | if (znode->cnext || !ubifs_zn_dirty(znode)) { | ||
| 1938 | znode = dirty_cow_bottom_up(c, | ||
| 1939 | znode); | ||
| 1940 | if (IS_ERR(znode)) { | ||
| 1941 | err = PTR_ERR(znode); | ||
| 1942 | goto out_unlock; | ||
| 1943 | } | ||
| 1944 | } | ||
| 1945 | zbr = &znode->zbranch[n]; | ||
| 1946 | lnc_free(zbr); | ||
| 1947 | err = ubifs_add_dirt(c, zbr->lnum, | ||
| 1948 | zbr->len); | ||
| 1949 | if (err) | ||
| 1950 | goto out_unlock; | ||
| 1951 | zbr->lnum = lnum; | ||
| 1952 | zbr->offs = offs; | ||
| 1953 | zbr->len = len; | ||
| 1954 | } | ||
| 1955 | } | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | if (!found) | ||
| 1959 | err = ubifs_add_dirt(c, lnum, len); | ||
| 1960 | |||
| 1961 | if (!err) | ||
| 1962 | err = dbg_check_tnc(c, 0); | ||
| 1963 | |||
| 1964 | out_unlock: | ||
| 1965 | mutex_unlock(&c->tnc_mutex); | ||
| 1966 | return err; | ||
| 1967 | } | ||
| 1968 | |||
| 1969 | /** | ||
| 1970 | * ubifs_tnc_add_nm - add a "hashed" node to TNC. | ||
| 1971 | * @c: UBIFS file-system description object | ||
| 1972 | * @key: key to add | ||
| 1973 | * @lnum: LEB number of node | ||
| 1974 | * @offs: node offset | ||
| 1975 | * @len: node length | ||
| 1976 | * @nm: node name | ||
| 1977 | * | ||
| 1978 | * This is the same as 'ubifs_tnc_add()' but it should be used with keys which | ||
| 1979 | * may have collisions, like directory entry keys. | ||
| 1980 | */ | ||
| 1981 | int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1982 | int lnum, int offs, int len, const struct qstr *nm) | ||
| 1983 | { | ||
| 1984 | int found, n, err = 0; | ||
| 1985 | struct ubifs_znode *znode; | ||
| 1986 | |||
| 1987 | mutex_lock(&c->tnc_mutex); | ||
| 1988 | dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, | ||
| 1989 | DBGKEY(key)); | ||
| 1990 | found = lookup_level0_dirty(c, key, &znode, &n); | ||
| 1991 | if (found < 0) { | ||
| 1992 | err = found; | ||
| 1993 | goto out_unlock; | ||
| 1994 | } | ||
| 1995 | |||
| 1996 | if (found == 1) { | ||
| 1997 | if (c->replaying) | ||
| 1998 | found = fallible_resolve_collision(c, key, &znode, &n, | ||
| 1999 | nm, 1); | ||
| 2000 | else | ||
| 2001 | found = resolve_collision(c, key, &znode, &n, nm); | ||
| 2002 | dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); | ||
| 2003 | if (found < 0) { | ||
| 2004 | err = found; | ||
| 2005 | goto out_unlock; | ||
| 2006 | } | ||
| 2007 | |||
| 2008 | /* Ensure the znode is dirtied */ | ||
| 2009 | if (znode->cnext || !ubifs_zn_dirty(znode)) { | ||
| 2010 | znode = dirty_cow_bottom_up(c, znode); | ||
| 2011 | if (IS_ERR(znode)) { | ||
| 2012 | err = PTR_ERR(znode); | ||
| 2013 | goto out_unlock; | ||
| 2014 | } | ||
| 2015 | } | ||
| 2016 | |||
| 2017 | if (found == 1) { | ||
| 2018 | struct ubifs_zbranch *zbr = &znode->zbranch[n]; | ||
| 2019 | |||
| 2020 | lnc_free(zbr); | ||
| 2021 | err = ubifs_add_dirt(c, zbr->lnum, zbr->len); | ||
| 2022 | zbr->lnum = lnum; | ||
| 2023 | zbr->offs = offs; | ||
| 2024 | zbr->len = len; | ||
| 2025 | goto out_unlock; | ||
| 2026 | } | ||
| 2027 | } | ||
| 2028 | |||
| 2029 | if (!found) { | ||
| 2030 | struct ubifs_zbranch zbr; | ||
| 2031 | |||
| 2032 | zbr.znode = NULL; | ||
| 2033 | zbr.lnum = lnum; | ||
| 2034 | zbr.offs = offs; | ||
| 2035 | zbr.len = len; | ||
| 2036 | key_copy(c, key, &zbr.key); | ||
| 2037 | err = tnc_insert(c, znode, &zbr, n + 1); | ||
| 2038 | if (err) | ||
| 2039 | goto out_unlock; | ||
| 2040 | if (c->replaying) { | ||
| 2041 | /* | ||
| 2042 | * We did not find it in the index so there may be a | ||
| 2043 | * dangling branch still in the index. So we remove it | ||
| 2044 | * by passing 'ubifs_tnc_remove_nm()' the same key but | ||
| 2045 | * an unmatchable name. | ||
| 2046 | */ | ||
| 2047 | struct qstr noname = { .len = 0, .name = "" }; | ||
| 2048 | |||
| 2049 | err = dbg_check_tnc(c, 0); | ||
| 2050 | mutex_unlock(&c->tnc_mutex); | ||
| 2051 | if (err) | ||
| 2052 | return err; | ||
| 2053 | return ubifs_tnc_remove_nm(c, key, &noname); | ||
| 2054 | } | ||
| 2055 | } | ||
| 2056 | |||
| 2057 | out_unlock: | ||
| 2058 | if (!err) | ||
| 2059 | err = dbg_check_tnc(c, 0); | ||
| 2060 | mutex_unlock(&c->tnc_mutex); | ||
| 2061 | return err; | ||
| 2062 | } | ||
| 2063 | |||
| 2064 | /** | ||
| 2065 | * tnc_delete - delete a znode form TNC. | ||
| 2066 | * @c: UBIFS file-system description object | ||
| 2067 | * @znode: znode to delete from | ||
| 2068 | * @n: zbranch slot number to delete | ||
| 2069 | * | ||
| 2070 | * This function deletes a leaf node from @n-th slot of @znode. Returns zero in | ||
| 2071 | * case of success and a negative error code in case of failure. | ||
| 2072 | */ | ||
| 2073 | static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) | ||
| 2074 | { | ||
| 2075 | struct ubifs_zbranch *zbr; | ||
| 2076 | struct ubifs_znode *zp; | ||
| 2077 | int i, err; | ||
| 2078 | |||
| 2079 | /* Delete without merge for now */ | ||
| 2080 | ubifs_assert(znode->level == 0); | ||
| 2081 | ubifs_assert(n >= 0 && n < c->fanout); | ||
| 2082 | dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); | ||
| 2083 | |||
| 2084 | zbr = &znode->zbranch[n]; | ||
| 2085 | lnc_free(zbr); | ||
| 2086 | |||
| 2087 | err = ubifs_add_dirt(c, zbr->lnum, zbr->len); | ||
| 2088 | if (err) { | ||
| 2089 | dbg_dump_znode(c, znode); | ||
| 2090 | return err; | ||
| 2091 | } | ||
| 2092 | |||
| 2093 | /* We do not "gap" zbranch slots */ | ||
| 2094 | for (i = n; i < znode->child_cnt - 1; i++) | ||
| 2095 | znode->zbranch[i] = znode->zbranch[i + 1]; | ||
| 2096 | znode->child_cnt -= 1; | ||
| 2097 | |||
| 2098 | if (znode->child_cnt > 0) | ||
| 2099 | return 0; | ||
| 2100 | |||
| 2101 | /* | ||
| 2102 | * This was the last zbranch, we have to delete this znode from the | ||
| 2103 | * parent. | ||
| 2104 | */ | ||
| 2105 | |||
| 2106 | do { | ||
| 2107 | ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); | ||
| 2108 | ubifs_assert(ubifs_zn_dirty(znode)); | ||
| 2109 | |||
| 2110 | zp = znode->parent; | ||
| 2111 | n = znode->iip; | ||
| 2112 | |||
| 2113 | atomic_long_dec(&c->dirty_zn_cnt); | ||
| 2114 | |||
| 2115 | err = insert_old_idx_znode(c, znode); | ||
| 2116 | if (err) | ||
| 2117 | return err; | ||
| 2118 | |||
| 2119 | if (znode->cnext) { | ||
| 2120 | __set_bit(OBSOLETE_ZNODE, &znode->flags); | ||
| 2121 | atomic_long_inc(&c->clean_zn_cnt); | ||
| 2122 | atomic_long_inc(&ubifs_clean_zn_cnt); | ||
| 2123 | } else | ||
| 2124 | kfree(znode); | ||
| 2125 | znode = zp; | ||
| 2126 | } while (znode->child_cnt == 1); /* while removing last child */ | ||
| 2127 | |||
| 2128 | /* Remove from znode, entry n - 1 */ | ||
| 2129 | znode->child_cnt -= 1; | ||
| 2130 | ubifs_assert(znode->level != 0); | ||
| 2131 | for (i = n; i < znode->child_cnt; i++) { | ||
| 2132 | znode->zbranch[i] = znode->zbranch[i + 1]; | ||
| 2133 | if (znode->zbranch[i].znode) | ||
| 2134 | znode->zbranch[i].znode->iip = i; | ||
| 2135 | } | ||
| 2136 | |||
| 2137 | /* | ||
| 2138 | * If this is the root and it has only 1 child then | ||
| 2139 | * collapse the tree. | ||
| 2140 | */ | ||
| 2141 | if (!znode->parent) { | ||
| 2142 | while (znode->child_cnt == 1 && znode->level != 0) { | ||
| 2143 | zp = znode; | ||
| 2144 | zbr = &znode->zbranch[0]; | ||
| 2145 | znode = get_znode(c, znode, 0); | ||
| 2146 | if (IS_ERR(znode)) | ||
| 2147 | return PTR_ERR(znode); | ||
| 2148 | znode = dirty_cow_znode(c, zbr); | ||
| 2149 | if (IS_ERR(znode)) | ||
| 2150 | return PTR_ERR(znode); | ||
| 2151 | znode->parent = NULL; | ||
| 2152 | znode->iip = 0; | ||
| 2153 | if (c->zroot.len) { | ||
| 2154 | err = insert_old_idx(c, c->zroot.lnum, | ||
| 2155 | c->zroot.offs); | ||
| 2156 | if (err) | ||
| 2157 | return err; | ||
| 2158 | } | ||
| 2159 | c->zroot.lnum = zbr->lnum; | ||
| 2160 | c->zroot.offs = zbr->offs; | ||
| 2161 | c->zroot.len = zbr->len; | ||
| 2162 | c->zroot.znode = znode; | ||
| 2163 | ubifs_assert(!test_bit(OBSOLETE_ZNODE, | ||
| 2164 | &zp->flags)); | ||
| 2165 | ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); | ||
| 2166 | atomic_long_dec(&c->dirty_zn_cnt); | ||
| 2167 | |||
| 2168 | if (zp->cnext) { | ||
| 2169 | __set_bit(OBSOLETE_ZNODE, &zp->flags); | ||
| 2170 | atomic_long_inc(&c->clean_zn_cnt); | ||
| 2171 | atomic_long_inc(&ubifs_clean_zn_cnt); | ||
| 2172 | } else | ||
| 2173 | kfree(zp); | ||
| 2174 | } | ||
| 2175 | } | ||
| 2176 | |||
| 2177 | return 0; | ||
| 2178 | } | ||
| 2179 | |||
| 2180 | /** | ||
| 2181 | * ubifs_tnc_remove - remove an index entry of a node. | ||
| 2182 | * @c: UBIFS file-system description object | ||
| 2183 | * @key: key of node | ||
| 2184 | * | ||
| 2185 | * Returns %0 on success or negative error code on failure. | ||
| 2186 | */ | ||
| 2187 | int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) | ||
| 2188 | { | ||
| 2189 | int found, n, err = 0; | ||
| 2190 | struct ubifs_znode *znode; | ||
| 2191 | |||
| 2192 | mutex_lock(&c->tnc_mutex); | ||
| 2193 | dbg_tnc("key %s", DBGKEY(key)); | ||
| 2194 | found = lookup_level0_dirty(c, key, &znode, &n); | ||
| 2195 | if (found < 0) { | ||
| 2196 | err = found; | ||
| 2197 | goto out_unlock; | ||
| 2198 | } | ||
| 2199 | if (found == 1) | ||
| 2200 | err = tnc_delete(c, znode, n); | ||
| 2201 | if (!err) | ||
| 2202 | err = dbg_check_tnc(c, 0); | ||
| 2203 | |||
| 2204 | out_unlock: | ||
| 2205 | mutex_unlock(&c->tnc_mutex); | ||
| 2206 | return err; | ||
| 2207 | } | ||
| 2208 | |||
| 2209 | /** | ||
| 2210 | * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. | ||
| 2211 | * @c: UBIFS file-system description object | ||
| 2212 | * @key: key of node | ||
| 2213 | * @nm: directory entry name | ||
| 2214 | * | ||
| 2215 | * Returns %0 on success or negative error code on failure. | ||
| 2216 | */ | ||
| 2217 | int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 2218 | const struct qstr *nm) | ||
| 2219 | { | ||
| 2220 | int n, err; | ||
| 2221 | struct ubifs_znode *znode; | ||
| 2222 | |||
| 2223 | mutex_lock(&c->tnc_mutex); | ||
| 2224 | dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); | ||
| 2225 | err = lookup_level0_dirty(c, key, &znode, &n); | ||
| 2226 | if (err < 0) | ||
| 2227 | goto out_unlock; | ||
| 2228 | |||
| 2229 | if (err) { | ||
| 2230 | if (c->replaying) | ||
| 2231 | err = fallible_resolve_collision(c, key, &znode, &n, | ||
| 2232 | nm, 0); | ||
| 2233 | else | ||
| 2234 | err = resolve_collision(c, key, &znode, &n, nm); | ||
| 2235 | dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); | ||
| 2236 | if (err < 0) | ||
| 2237 | goto out_unlock; | ||
| 2238 | if (err) { | ||
| 2239 | /* Ensure the znode is dirtied */ | ||
| 2240 | if (znode->cnext || !ubifs_zn_dirty(znode)) { | ||
| 2241 | znode = dirty_cow_bottom_up(c, znode); | ||
| 2242 | if (IS_ERR(znode)) { | ||
| 2243 | err = PTR_ERR(znode); | ||
| 2244 | goto out_unlock; | ||
| 2245 | } | ||
| 2246 | } | ||
| 2247 | err = tnc_delete(c, znode, n); | ||
| 2248 | } | ||
| 2249 | } | ||
| 2250 | |||
| 2251 | out_unlock: | ||
| 2252 | if (!err) | ||
| 2253 | err = dbg_check_tnc(c, 0); | ||
| 2254 | mutex_unlock(&c->tnc_mutex); | ||
| 2255 | return err; | ||
| 2256 | } | ||
| 2257 | |||
| 2258 | /** | ||
| 2259 | * key_in_range - determine if a key falls within a range of keys. | ||
| 2260 | * @c: UBIFS file-system description object | ||
| 2261 | * @key: key to check | ||
| 2262 | * @from_key: lowest key in range | ||
| 2263 | * @to_key: highest key in range | ||
| 2264 | * | ||
| 2265 | * This function returns %1 if the key is in range and %0 otherwise. | ||
| 2266 | */ | ||
| 2267 | static int key_in_range(struct ubifs_info *c, union ubifs_key *key, | ||
| 2268 | union ubifs_key *from_key, union ubifs_key *to_key) | ||
| 2269 | { | ||
| 2270 | if (keys_cmp(c, key, from_key) < 0) | ||
| 2271 | return 0; | ||
| 2272 | if (keys_cmp(c, key, to_key) > 0) | ||
| 2273 | return 0; | ||
| 2274 | return 1; | ||
| 2275 | } | ||
| 2276 | |||
| 2277 | /** | ||
| 2278 | * ubifs_tnc_remove_range - remove index entries in range. | ||
| 2279 | * @c: UBIFS file-system description object | ||
| 2280 | * @from_key: lowest key to remove | ||
| 2281 | * @to_key: highest key to remove | ||
| 2282 | * | ||
| 2283 | * This function removes index entries starting at @from_key and ending at | ||
| 2284 | * @to_key. This function returns zero in case of success and a negative error | ||
| 2285 | * code in case of failure. | ||
| 2286 | */ | ||
| 2287 | int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, | ||
| 2288 | union ubifs_key *to_key) | ||
| 2289 | { | ||
| 2290 | int i, n, k, err = 0; | ||
| 2291 | struct ubifs_znode *znode; | ||
| 2292 | union ubifs_key *key; | ||
| 2293 | |||
| 2294 | mutex_lock(&c->tnc_mutex); | ||
| 2295 | while (1) { | ||
| 2296 | /* Find first level 0 znode that contains keys to remove */ | ||
| 2297 | err = ubifs_lookup_level0(c, from_key, &znode, &n); | ||
| 2298 | if (err < 0) | ||
| 2299 | goto out_unlock; | ||
| 2300 | |||
| 2301 | if (err) | ||
| 2302 | key = from_key; | ||
| 2303 | else { | ||
| 2304 | err = tnc_next(c, &znode, &n); | ||
| 2305 | if (err == -ENOENT) { | ||
| 2306 | err = 0; | ||
| 2307 | goto out_unlock; | ||
| 2308 | } | ||
| 2309 | if (err < 0) | ||
| 2310 | goto out_unlock; | ||
| 2311 | key = &znode->zbranch[n].key; | ||
| 2312 | if (!key_in_range(c, key, from_key, to_key)) { | ||
| 2313 | err = 0; | ||
| 2314 | goto out_unlock; | ||
| 2315 | } | ||
| 2316 | } | ||
| 2317 | |||
| 2318 | /* Ensure the znode is dirtied */ | ||
| 2319 | if (znode->cnext || !ubifs_zn_dirty(znode)) { | ||
| 2320 | znode = dirty_cow_bottom_up(c, znode); | ||
| 2321 | if (IS_ERR(znode)) { | ||
| 2322 | err = PTR_ERR(znode); | ||
| 2323 | goto out_unlock; | ||
| 2324 | } | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | /* Remove all keys in range except the first */ | ||
| 2328 | for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) { | ||
| 2329 | key = &znode->zbranch[i].key; | ||
| 2330 | if (!key_in_range(c, key, from_key, to_key)) | ||
| 2331 | break; | ||
| 2332 | lnc_free(&znode->zbranch[i]); | ||
| 2333 | err = ubifs_add_dirt(c, znode->zbranch[i].lnum, | ||
| 2334 | znode->zbranch[i].len); | ||
| 2335 | if (err) { | ||
| 2336 | dbg_dump_znode(c, znode); | ||
| 2337 | goto out_unlock; | ||
| 2338 | } | ||
| 2339 | dbg_tnc("removing %s", DBGKEY(key)); | ||
| 2340 | } | ||
| 2341 | if (k) { | ||
| 2342 | for (i = n + 1 + k; i < znode->child_cnt; i++) | ||
| 2343 | znode->zbranch[i - k] = znode->zbranch[i]; | ||
| 2344 | znode->child_cnt -= k; | ||
| 2345 | } | ||
| 2346 | |||
| 2347 | /* Now delete the first */ | ||
| 2348 | err = tnc_delete(c, znode, n); | ||
| 2349 | if (err) | ||
| 2350 | goto out_unlock; | ||
| 2351 | } | ||
| 2352 | |||
| 2353 | out_unlock: | ||
| 2354 | if (!err) | ||
| 2355 | err = dbg_check_tnc(c, 0); | ||
| 2356 | mutex_unlock(&c->tnc_mutex); | ||
| 2357 | return err; | ||
| 2358 | } | ||
| 2359 | |||
| 2360 | /** | ||
| 2361 | * ubifs_tnc_remove_ino - remove an inode from TNC. | ||
| 2362 | * @c: UBIFS file-system description object | ||
| 2363 | * @inum: inode number to remove | ||
| 2364 | * | ||
| 2365 | * This function remove inode @inum and all the extended attributes associated | ||
| 2366 | * with the anode from TNC and returns zero in case of success or a negative | ||
| 2367 | * error code in case of failure. | ||
| 2368 | */ | ||
| 2369 | int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) | ||
| 2370 | { | ||
| 2371 | union ubifs_key key1, key2; | ||
| 2372 | struct ubifs_dent_node *xent, *pxent = NULL; | ||
| 2373 | struct qstr nm = { .name = NULL }; | ||
| 2374 | |||
| 2375 | dbg_tnc("ino %lu", inum); | ||
| 2376 | |||
| 2377 | /* | ||
| 2378 | * Walk all extended attribute entries and remove them together with | ||
| 2379 | * corresponding extended attribute inodes. | ||
| 2380 | */ | ||
| 2381 | lowest_xent_key(c, &key1, inum); | ||
| 2382 | while (1) { | ||
| 2383 | ino_t xattr_inum; | ||
| 2384 | int err; | ||
| 2385 | |||
| 2386 | xent = ubifs_tnc_next_ent(c, &key1, &nm); | ||
| 2387 | if (IS_ERR(xent)) { | ||
| 2388 | err = PTR_ERR(xent); | ||
| 2389 | if (err == -ENOENT) | ||
| 2390 | break; | ||
| 2391 | return err; | ||
| 2392 | } | ||
| 2393 | |||
| 2394 | xattr_inum = le64_to_cpu(xent->inum); | ||
| 2395 | dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum); | ||
| 2396 | |||
| 2397 | nm.name = xent->name; | ||
| 2398 | nm.len = le16_to_cpu(xent->nlen); | ||
| 2399 | err = ubifs_tnc_remove_nm(c, &key1, &nm); | ||
| 2400 | if (err) { | ||
| 2401 | kfree(xent); | ||
| 2402 | return err; | ||
| 2403 | } | ||
| 2404 | |||
| 2405 | lowest_ino_key(c, &key1, xattr_inum); | ||
| 2406 | highest_ino_key(c, &key2, xattr_inum); | ||
| 2407 | err = ubifs_tnc_remove_range(c, &key1, &key2); | ||
| 2408 | if (err) { | ||
| 2409 | kfree(xent); | ||
| 2410 | return err; | ||
| 2411 | } | ||
| 2412 | |||
| 2413 | kfree(pxent); | ||
| 2414 | pxent = xent; | ||
| 2415 | key_read(c, &xent->key, &key1); | ||
| 2416 | } | ||
| 2417 | |||
| 2418 | kfree(pxent); | ||
| 2419 | lowest_ino_key(c, &key1, inum); | ||
| 2420 | highest_ino_key(c, &key2, inum); | ||
| 2421 | |||
| 2422 | return ubifs_tnc_remove_range(c, &key1, &key2); | ||
| 2423 | } | ||
| 2424 | |||
/**
 * ubifs_tnc_next_ent - walk directory or extended attribute entries.
 * @c: UBIFS file-system description object
 * @key: key of last entry
 * @nm: name of last entry found or %NULL
 *
 * This function finds and reads the next directory or extended attribute entry
 * after the given key (@key) if there is one. @nm is used to resolve
 * collisions.
 *
 * If the name of the current entry is not known and only the key is known,
 * @nm->name has to be %NULL. In this case the semantics of this function is a
 * little bit different and it returns the entry corresponding to this key, not
 * the next one. If the key was not found, the closest "right" entry is
 * returned.
 *
 * If the first entry has to be found, @key has to contain the lowest possible
 * key value for this inode and @nm->name has to be %NULL.
 *
 * This function returns the found directory or extended attribute entry node
 * in case of success, %-ENOENT is returned if no entry was found, and a
 * negative error code is returned in case of failure.
 */
struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
					   union ubifs_key *key,
					   const struct qstr *nm)
{
	int n, err, type = key_type(c, key);
	struct ubifs_znode *znode;
	struct ubifs_dent_node *dent;
	struct ubifs_zbranch *zbr;
	union ubifs_key *dkey;

	dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
	/* Directory/xattr entry keys are hashed, hence may collide */
	ubifs_assert(is_hash_key(c, key));

	mutex_lock(&c->tnc_mutex);
	err = ubifs_lookup_level0(c, key, &znode, &n);
	if (unlikely(err < 0))
		goto out_unlock;

	if (nm->name) {
		if (err) {
			/* Handle collisions - find the exact matching entry */
			err = resolve_collision(c, key, &znode, &n, nm);
			dbg_tnc("rc returned %d, znode %p, n %d",
				err, znode, n);
			if (unlikely(err < 0))
				goto out_unlock;
		}

		/* Now find next entry */
		err = tnc_next(c, &znode, &n);
		if (unlikely(err))
			goto out_unlock;
	} else {
		/*
		 * The full name of the entry was not given, in which case the
		 * behavior of this function is a little different and it
		 * returns current entry, not the next one.
		 */
		if (!err) {
			/*
			 * However, the given key does not exist in the TNC
			 * tree and @znode/@n variables contain the closest
			 * "preceding" element. Switch to the next one.
			 */
			err = tnc_next(c, &znode, &n);
			if (err)
				goto out_unlock;
		}
	}

	zbr = &znode->zbranch[n];
	dent = kmalloc(zbr->len, GFP_NOFS);
	if (unlikely(!dent)) {
		err = -ENOMEM;
		goto out_unlock;
	}

	/*
	 * The above 'tnc_next()' call could lead us to the next inode, check
	 * this.
	 */
	dkey = &zbr->key;
	if (key_inum(c, dkey) != key_inum(c, key) ||
	    key_type(c, dkey) != type) {
		/* Walked past the last entry of this inode */
		err = -ENOENT;
		goto out_free;
	}

	err = tnc_read_node_nm(c, zbr, dent);
	if (unlikely(err))
		goto out_free;

	mutex_unlock(&c->tnc_mutex);
	return dent;

out_free:
	kfree(dent);
out_unlock:
	mutex_unlock(&c->tnc_mutex);
	return ERR_PTR(err);
}
| 2529 | |||
| 2530 | /** | ||
| 2531 | * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. | ||
| 2532 | * @c: UBIFS file-system description object | ||
| 2533 | * | ||
| 2534 | * Destroy left-over obsolete znodes from a failed commit. | ||
| 2535 | */ | ||
| 2536 | static void tnc_destroy_cnext(struct ubifs_info *c) | ||
| 2537 | { | ||
| 2538 | struct ubifs_znode *cnext; | ||
| 2539 | |||
| 2540 | if (!c->cnext) | ||
| 2541 | return; | ||
| 2542 | ubifs_assert(c->cmt_state == COMMIT_BROKEN); | ||
| 2543 | cnext = c->cnext; | ||
| 2544 | do { | ||
| 2545 | struct ubifs_znode *znode = cnext; | ||
| 2546 | |||
| 2547 | cnext = cnext->cnext; | ||
| 2548 | if (test_bit(OBSOLETE_ZNODE, &znode->flags)) | ||
| 2549 | kfree(znode); | ||
| 2550 | } while (cnext && cnext != c->cnext); | ||
| 2551 | } | ||
| 2552 | |||
| 2553 | /** | ||
| 2554 | * ubifs_tnc_close - close TNC subsystem and free all related resources. | ||
| 2555 | * @c: UBIFS file-system description object | ||
| 2556 | */ | ||
| 2557 | void ubifs_tnc_close(struct ubifs_info *c) | ||
| 2558 | { | ||
| 2559 | long clean_freed; | ||
| 2560 | |||
| 2561 | tnc_destroy_cnext(c); | ||
| 2562 | if (c->zroot.znode) { | ||
| 2563 | clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); | ||
| 2564 | atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); | ||
| 2565 | } | ||
| 2566 | kfree(c->gap_lebs); | ||
| 2567 | kfree(c->ilebs); | ||
| 2568 | destroy_old_idx(c); | ||
| 2569 | } | ||
| 2570 | |||
| 2571 | /** | ||
| 2572 | * left_znode - get the znode to the left. | ||
| 2573 | * @c: UBIFS file-system description object | ||
| 2574 | * @znode: znode | ||
| 2575 | * | ||
| 2576 | * This function returns a pointer to the znode to the left of @znode or NULL if | ||
| 2577 | * there is not one. A negative error code is returned on failure. | ||
| 2578 | */ | ||
| 2579 | static struct ubifs_znode *left_znode(struct ubifs_info *c, | ||
| 2580 | struct ubifs_znode *znode) | ||
| 2581 | { | ||
| 2582 | int level = znode->level; | ||
| 2583 | |||
| 2584 | while (1) { | ||
| 2585 | int n = znode->iip - 1; | ||
| 2586 | |||
| 2587 | /* Go up until we can go left */ | ||
| 2588 | znode = znode->parent; | ||
| 2589 | if (!znode) | ||
| 2590 | return NULL; | ||
| 2591 | if (n >= 0) { | ||
| 2592 | /* Now go down the rightmost branch to 'level' */ | ||
| 2593 | znode = get_znode(c, znode, n); | ||
| 2594 | if (IS_ERR(znode)) | ||
| 2595 | return znode; | ||
| 2596 | while (znode->level != level) { | ||
| 2597 | n = znode->child_cnt - 1; | ||
| 2598 | znode = get_znode(c, znode, n); | ||
| 2599 | if (IS_ERR(znode)) | ||
| 2600 | return znode; | ||
| 2601 | } | ||
| 2602 | break; | ||
| 2603 | } | ||
| 2604 | } | ||
| 2605 | return znode; | ||
| 2606 | } | ||
| 2607 | |||
| 2608 | /** | ||
| 2609 | * right_znode - get the znode to the right. | ||
| 2610 | * @c: UBIFS file-system description object | ||
| 2611 | * @znode: znode | ||
| 2612 | * | ||
| 2613 | * This function returns a pointer to the znode to the right of @znode or NULL | ||
| 2614 | * if there is not one. A negative error code is returned on failure. | ||
| 2615 | */ | ||
| 2616 | static struct ubifs_znode *right_znode(struct ubifs_info *c, | ||
| 2617 | struct ubifs_znode *znode) | ||
| 2618 | { | ||
| 2619 | int level = znode->level; | ||
| 2620 | |||
| 2621 | while (1) { | ||
| 2622 | int n = znode->iip + 1; | ||
| 2623 | |||
| 2624 | /* Go up until we can go right */ | ||
| 2625 | znode = znode->parent; | ||
| 2626 | if (!znode) | ||
| 2627 | return NULL; | ||
| 2628 | if (n < znode->child_cnt) { | ||
| 2629 | /* Now go down the leftmost branch to 'level' */ | ||
| 2630 | znode = get_znode(c, znode, n); | ||
| 2631 | if (IS_ERR(znode)) | ||
| 2632 | return znode; | ||
| 2633 | while (znode->level != level) { | ||
| 2634 | znode = get_znode(c, znode, 0); | ||
| 2635 | if (IS_ERR(znode)) | ||
| 2636 | return znode; | ||
| 2637 | } | ||
| 2638 | break; | ||
| 2639 | } | ||
| 2640 | } | ||
| 2641 | return znode; | ||
| 2642 | } | ||
| 2643 | |||
/**
 * lookup_znode - find a particular indexing node from TNC.
 * @c: UBIFS file-system description object
 * @key: index node key to lookup
 * @level: index node level
 * @lnum: index node LEB number
 * @offs: index node offset
 *
 * This function searches an indexing node by its first key @key and its
 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
 * nodes it traverses to TNC. This function is called for indexing nodes which
 * were found on the media by scanning, for example when garbage-collecting or
 * when doing in-the-gaps commit. This means that the indexing node which is
 * looked for does not have to have exactly the same leftmost key @key, because
 * the leftmost key may have been changed, in which case TNC will contain a
 * dirty znode which still refers the same @lnum:@offs. This function is clever
 * enough to recognize such indexing nodes.
 *
 * Note, if a znode was deleted or changed too much, then this function will
 * not find it. For situations like this UBIFS has the old index RB-tree
 * (indexed by @lnum:@offs).
 *
 * This function returns a pointer to the znode found or %NULL if it is not
 * found. A negative error code is returned on failure.
 */
static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
					union ubifs_key *key, int level,
					int lnum, int offs)
{
	struct ubifs_znode *znode, *zn;
	int n, nn;

	/*
	 * The arguments have probably been read off flash, so don't assume
	 * they are valid.
	 */
	if (level < 0)
		return ERR_PTR(-EINVAL);

	/* Get the root znode */
	znode = c->zroot.znode;
	if (!znode) {
		znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
		if (IS_ERR(znode))
			return znode;
	}
	/* Check if it is the one we are looking for */
	if (c->zroot.lnum == lnum && c->zroot.offs == offs)
		return znode;
	/* Descend to the parent level i.e. (level + 1) */
	if (level >= znode->level)
		return NULL;
	while (1) {
		ubifs_search_zbranch(c, znode, key, &n);
		if (n < 0) {
			/*
			 * We reached a znode where the leftmost key is greater
			 * than the key we are searching for. This is the same
			 * situation as the one described in a huge comment at
			 * the end of the 'ubifs_lookup_level0()' function. And
			 * for exactly the same reasons we have to try to look
			 * left before giving up.
			 */
			znode = left_znode(c, znode);
			if (!znode)
				return NULL;
			if (IS_ERR(znode))
				return znode;
			ubifs_search_zbranch(c, znode, key, &n);
			ubifs_assert(n >= 0);
		}
		if (znode->level == level + 1)
			break;
		znode = get_znode(c, znode, n);
		if (IS_ERR(znode))
			return znode;
	}
	/* Check if the child is the one we are looking for */
	if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
		return get_znode(c, znode, n);
	/* If the key is unique, there is nowhere else to look */
	if (!is_hash_key(c, key))
		return NULL;
	/*
	 * The key is not unique and so may be also in the znodes to either
	 * side.
	 */
	zn = znode;
	nn = n;
	/* Look left */
	while (1) {
		/* Move one branch to the left */
		if (n)
			n -= 1;
		else {
			znode = left_znode(c, znode);
			if (!znode)
				break;
			if (IS_ERR(znode))
				return znode;
			n = znode->child_cnt - 1;
		}
		/* Check it */
		if (znode->zbranch[n].lnum == lnum &&
		    znode->zbranch[n].offs == offs)
			return get_znode(c, znode, n);
		/* Stop if the key is less than the one we are looking for */
		if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
			break;
	}
	/* Back to the middle */
	znode = zn;
	n = nn;
	/* Look right */
	while (1) {
		/* Move one branch to the right */
		if (++n >= znode->child_cnt) {
			znode = right_znode(c, znode);
			if (!znode)
				break;
			if (IS_ERR(znode))
				return znode;
			n = 0;
		}
		/* Check it */
		if (znode->zbranch[n].lnum == lnum &&
		    znode->zbranch[n].offs == offs)
			return get_znode(c, znode, n);
		/* Stop if the key is greater than the one we are looking for */
		if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
			break;
	}
	return NULL;
}
| 2778 | |||
/**
 * is_idx_node_in_tnc - determine if an index node is in the TNC.
 * @c: UBIFS file-system description object
 * @key: key of index node
 * @level: index node level
 * @lnum: LEB number of index node
 * @offs: offset of index node
 *
 * This function returns %0 if the index node is not referred to in the TNC, %1
 * if the index node is referred to in the TNC and the corresponding znode is
 * dirty, %2 if an index node is referred to in the TNC and the corresponding
 * znode is clean, and a negative error code in case of failure.
 *
 * Note, the @key argument has to be the key of the first child. Also note,
 * this function relies on the fact that 0:0 is never a valid LEB number and
 * offset for a main-area node.
 */
int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
		       int lnum, int offs)
{
	struct ubifs_znode *znode = lookup_znode(c, key, level, lnum, offs);

	if (IS_ERR(znode))
		return PTR_ERR(znode);
	if (!znode)
		return 0;
	if (ubifs_zn_dirty(znode))
		return 1;
	return 2;
}
| 2809 | |||
/**
 * is_leaf_node_in_tnc - determine if a non-indexing node is in the TNC.
 * @c: UBIFS file-system description object
 * @key: node key
 * @lnum: node LEB number
 * @offs: node offset
 *
 * This function returns %1 if the node is referred to in the TNC, %0 if it is
 * not, and a negative error code in case of failure.
 *
 * Note, this function relies on the fact that 0:0 is never a valid LEB number
 * and offset for a main-area node.
 */
static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
			       int lnum, int offs)
{
	struct ubifs_zbranch *zbr;
	struct ubifs_znode *znode, *zn;
	int n, found, err, nn;
	const int unique = !is_hash_key(c, key);

	found = ubifs_lookup_level0(c, key, &znode, &n);
	if (found < 0)
		return found; /* Error code */
	if (!found)
		return 0;
	zbr = &znode->zbranch[n];
	if (lnum == zbr->lnum && offs == zbr->offs)
		return 1; /* Found it */
	if (unique)
		return 0;
	/*
	 * Because the key is not unique, we have to look left
	 * and right as well
	 */
	zn = znode;
	nn = n;
	/* Look left */
	while (1) {
		err = tnc_prev(c, &znode, &n);
		if (err == -ENOENT)
			break;
		if (err)
			return err;
		/* Stop once the key no longer collides with @key */
		if (keys_cmp(c, key, &znode->zbranch[n].key))
			break;
		zbr = &znode->zbranch[n];
		if (lnum == zbr->lnum && offs == zbr->offs)
			return 1; /* Found it */
	}
	/* Look right */
	znode = zn;
	n = nn;
	while (1) {
		err = tnc_next(c, &znode, &n);
		if (err) {
			if (err == -ENOENT)
				return 0;
			return err;
		}
		/* Stop once the key no longer collides with @key */
		if (keys_cmp(c, key, &znode->zbranch[n].key))
			break;
		zbr = &znode->zbranch[n];
		if (lnum == zbr->lnum && offs == zbr->offs)
			return 1; /* Found it */
	}
	return 0;
}
| 2878 | |||
| 2879 | /** | ||
| 2880 | * ubifs_tnc_has_node - determine whether a node is in the TNC. | ||
| 2881 | * @c: UBIFS file-system description object | ||
| 2882 | * @key: node key | ||
| 2883 | * @level: index node level (if it is an index node) | ||
| 2884 | * @lnum: node LEB number | ||
| 2885 | * @offs: node offset | ||
| 2886 | * @is_idx: non-zero if the node is an index node | ||
| 2887 | * | ||
| 2888 | * This function returns %1 if the node is in the TNC, %0 if it is not, and a | ||
| 2889 | * negative error code in case of failure. For index nodes, @key has to be the | ||
| 2890 | * key of the first child. An index node is considered to be in the TNC only if | ||
| 2891 | * the corresponding znode is clean or has not been loaded. | ||
| 2892 | */ | ||
| 2893 | int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, | ||
| 2894 | int lnum, int offs, int is_idx) | ||
| 2895 | { | ||
| 2896 | int err; | ||
| 2897 | |||
| 2898 | mutex_lock(&c->tnc_mutex); | ||
| 2899 | if (is_idx) { | ||
| 2900 | err = is_idx_node_in_tnc(c, key, level, lnum, offs); | ||
| 2901 | if (err < 0) | ||
| 2902 | goto out_unlock; | ||
| 2903 | if (err == 1) | ||
| 2904 | /* The index node was found but it was dirty */ | ||
| 2905 | err = 0; | ||
| 2906 | else if (err == 2) | ||
| 2907 | /* The index node was found and it was clean */ | ||
| 2908 | err = 1; | ||
| 2909 | else | ||
| 2910 | BUG_ON(err != 0); | ||
| 2911 | } else | ||
| 2912 | err = is_leaf_node_in_tnc(c, key, lnum, offs); | ||
| 2913 | |||
| 2914 | out_unlock: | ||
| 2915 | mutex_unlock(&c->tnc_mutex); | ||
| 2916 | return err; | ||
| 2917 | } | ||
| 2918 | |||
| 2919 | /** | ||
| 2920 | * ubifs_dirty_idx_node - dirty an index node. | ||
| 2921 | * @c: UBIFS file-system description object | ||
| 2922 | * @key: index node key | ||
| 2923 | * @level: index node level | ||
| 2924 | * @lnum: index node LEB number | ||
| 2925 | * @offs: index node offset | ||
| 2926 | * | ||
| 2927 | * This function loads and dirties an index node so that it can be garbage | ||
| 2928 | * collected. The @key argument has to be the key of the first child. This | ||
| 2929 | * function relies on the fact that 0:0 is never a valid LEB number and offset | ||
| 2930 | * for a main-area node. Returns %0 on success and a negative error code on | ||
| 2931 | * failure. | ||
| 2932 | */ | ||
| 2933 | int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, | ||
| 2934 | int lnum, int offs) | ||
| 2935 | { | ||
| 2936 | struct ubifs_znode *znode; | ||
| 2937 | int err = 0; | ||
| 2938 | |||
| 2939 | mutex_lock(&c->tnc_mutex); | ||
| 2940 | znode = lookup_znode(c, key, level, lnum, offs); | ||
| 2941 | if (!znode) | ||
| 2942 | goto out_unlock; | ||
| 2943 | if (IS_ERR(znode)) { | ||
| 2944 | err = PTR_ERR(znode); | ||
| 2945 | goto out_unlock; | ||
| 2946 | } | ||
| 2947 | znode = dirty_cow_bottom_up(c, znode); | ||
| 2948 | if (IS_ERR(znode)) { | ||
| 2949 | err = PTR_ERR(znode); | ||
| 2950 | goto out_unlock; | ||
| 2951 | } | ||
| 2952 | |||
| 2953 | out_unlock: | ||
| 2954 | mutex_unlock(&c->tnc_mutex); | ||
| 2955 | return err; | ||
| 2956 | } | ||
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c new file mode 100644 index 000000000000..8117e65ba2e9 --- /dev/null +++ b/fs/ubifs/tnc_commit.c | |||
| @@ -0,0 +1,1103 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* This file implements TNC functions for committing */ | ||
| 24 | |||
| 25 | #include "ubifs.h" | ||
| 26 | |||
/**
 * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
 * @c: UBIFS file-system description object
 * @idx: buffer in which to place new index node
 * @znode: znode from which to make new index node
 * @lnum: LEB number where new index node will be written
 * @offs: offset where new index node will be written
 * @len: length of new index node
 *
 * This function serializes @znode into @idx, records the new on-flash
 * position in the parent zbranch (or the root), and clears the znode's
 * dirty/COW state. Returns %0 on success and a negative error code on
 * failure.
 */
static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
			 struct ubifs_znode *znode, int lnum, int offs, int len)
{
	struct ubifs_znode *zp;
	int i, err;

	/* Make index node */
	idx->ch.node_type = UBIFS_IDX_NODE;
	idx->child_cnt = cpu_to_le16(znode->child_cnt);
	idx->level = cpu_to_le16(znode->level);
	for (i = 0; i < znode->child_cnt; i++) {
		struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
		struct ubifs_zbranch *zbr = &znode->zbranch[i];

		key_write_idx(c, &zbr->key, &br->key);
		br->lnum = cpu_to_le32(zbr->lnum);
		br->offs = cpu_to_le32(zbr->offs);
		br->len = cpu_to_le32(zbr->len);
		/* 0:0 is never valid for a main-area node - complain loudly */
		if (!zbr->lnum || !zbr->len) {
			ubifs_err("bad ref in znode");
			dbg_dump_znode(c, znode);
			if (zbr->znode)
				dbg_dump_znode(c, zbr->znode);
		}
	}
	ubifs_prepare_node(c, idx, len, 0);

#ifdef CONFIG_UBIFS_FS_DEBUG
	znode->lnum = lnum;
	znode->offs = offs;
	znode->len = len;
#endif

	/* The old position of this znode becomes obsolete from now on */
	err = insert_old_idx_znode(c, znode);

	/* Update the parent */
	zp = znode->parent;
	if (zp) {
		struct ubifs_zbranch *zbr;

		zbr = &zp->zbranch[znode->iip];
		zbr->lnum = lnum;
		zbr->offs = offs;
		zbr->len = len;
	} else {
		c->zroot.lnum = lnum;
		c->zroot.offs = offs;
		c->zroot.len = len;
	}
	/* Account the new index node in the calculated index size */
	c->calc_idx_sz += ALIGN(len, 8);

	atomic_long_dec(&c->dirty_zn_cnt);

	ubifs_assert(ubifs_zn_dirty(znode));
	ubifs_assert(test_bit(COW_ZNODE, &znode->flags));

	/* The znode is now committed - clean and writable again */
	__clear_bit(DIRTY_ZNODE, &znode->flags);
	__clear_bit(COW_ZNODE, &znode->flags);

	return err;
}
| 97 | |||
/**
 * fill_gap - make index nodes in gaps in dirty index LEBs.
 * @c: UBIFS file-system description object
 * @lnum: LEB number that gap appears in
 * @gap_start: offset of start of gap
 * @gap_end: offset of end of gap
 * @dirt: adds dirty space to this
 *
 * This function packs as many of the pending commit znodes (the @c->enext
 * list) as will fit into the gap, padding whatever remains. It returns the
 * number of index nodes written into the gap, or a negative error code in
 * case of failure.
 */
static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
		    int *dirt)
{
	int len, gap_remains, gap_pos, written, pad_len;

	/* Gaps are 8-byte aligned, like all index node positions */
	ubifs_assert((gap_start & 7) == 0);
	ubifs_assert((gap_end & 7) == 0);
	ubifs_assert(gap_end >= gap_start);

	gap_remains = gap_end - gap_start;
	if (!gap_remains)
		return 0;
	gap_pos = gap_start;
	written = 0;
	while (c->enext) {
		len = ubifs_idx_node_sz(c, c->enext->child_cnt);
		if (len < gap_remains) {
			struct ubifs_znode *znode = c->enext;
			const int alen = ALIGN(len, 8);
			int err;

			ubifs_assert(alen <= gap_remains);
			err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
					    lnum, gap_pos, len);
			if (err)
				return err;
			gap_remains -= alen;
			gap_pos += alen;
			/* Advance to the next znode awaiting commit */
			c->enext = znode->cnext;
			if (c->enext == c->cnext)
				c->enext = NULL;
			written += 1;
		} else
			break;
	}
	if (gap_end == c->leb_size) {
		c->ileb_len = ALIGN(gap_pos, c->min_io_size);
		/* Pad to end of min_io_size */
		pad_len = c->ileb_len - gap_pos;
	} else
		/* Pad to end of gap */
		pad_len = gap_remains;
	dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
	       lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
	ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
	*dirt += pad_len;
	return written;
}
| 156 | |||
| 157 | /** | ||
| 158 | * find_old_idx - find an index node obsoleted since the last commit start. | ||
| 159 | * @c: UBIFS file-system description object | ||
| 160 | * @lnum: LEB number of obsoleted index node | ||
| 161 | * @offs: offset of obsoleted index node | ||
| 162 | * | ||
| 163 | * Returns %1 if found and %0 otherwise. | ||
| 164 | */ | ||
| 165 | static int find_old_idx(struct ubifs_info *c, int lnum, int offs) | ||
| 166 | { | ||
| 167 | struct ubifs_old_idx *o; | ||
| 168 | struct rb_node *p; | ||
| 169 | |||
| 170 | p = c->old_idx.rb_node; | ||
| 171 | while (p) { | ||
| 172 | o = rb_entry(p, struct ubifs_old_idx, rb); | ||
| 173 | if (lnum < o->lnum) | ||
| 174 | p = p->rb_left; | ||
| 175 | else if (lnum > o->lnum) | ||
| 176 | p = p->rb_right; | ||
| 177 | else if (offs < o->offs) | ||
| 178 | p = p->rb_left; | ||
| 179 | else if (offs > o->offs) | ||
| 180 | p = p->rb_right; | ||
| 181 | else | ||
| 182 | return 1; | ||
| 183 | } | ||
| 184 | return 0; | ||
| 185 | } | ||
| 186 | |||
/**
 * is_idx_node_in_use - determine if an index node can be overwritten.
 * @c: UBIFS file-system description object
 * @key: key of index node
 * @level: index node level
 * @lnum: LEB number of index node
 * @offs: offset of index node
 *
 * If @key / @lnum / @offs identify an index node that was not part of the old
 * index, then this function returns %0 (obsolete). Else if the index node was
 * part of the old index but is now dirty %1 is returned, else if it is clean %2
 * is returned. A negative error code is returned on failure.
 */
static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
			      int level, int lnum, int offs)
{
	int ret = is_idx_node_in_tnc(c, key, level, lnum, offs);

	if (ret < 0)
		return ret; /* Error code */
	/* Not in TNC, but may still belong to the pre-commit index */
	if (ret == 0 && find_old_idx(c, lnum, offs))
		return 1;
	return ret;
}
| 213 | |||
/**
 * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
 * @c: UBIFS file-system description object
 * @p: return LEB number here
 *
 * This function lays out new index nodes for dirty znodes using in-the-gaps
 * method of TNC commit.
 * This function merely puts the next znode into the next gap, making no attempt
 * to try to maximise the number of znodes that fit.
 * This function returns the number of index nodes written into the gaps, or a
 * negative error code on failure.
 */
static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
{
	struct ubifs_scan_leb *sleb;
	struct ubifs_scan_node *snod;
	int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;

	tot_written = 0;
	/* Get an index LEB with lots of obsolete index nodes */
	lnum = ubifs_find_dirty_idx_leb(c);
	if (lnum < 0)
		/*
		 * There also may be dirt in the index head that could be
		 * filled, however we do not check there at present.
		 */
		return lnum; /* Error code */
	*p = lnum;
	dbg_gc("LEB %d", lnum);
	/*
	 * Scan the index LEB. We use the generic scan for this even though
	 * it is more comprehensive and less efficient than is needed for this
	 * purpose.
	 */
	sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
	c->ileb_len = 0;
	if (IS_ERR(sleb))
		return PTR_ERR(sleb);
	gap_start = 0;
	list_for_each_entry(snod, &sleb->nodes, list) {
		struct ubifs_idx_node *idx;
		int in_use, level;

		/* Only index nodes are expected on an index LEB */
		ubifs_assert(snod->type == UBIFS_IDX_NODE);
		idx = snod->node;
		key_read(c, ubifs_idx_key(c, idx), &snod->key);
		level = le16_to_cpu(idx->level);
		/* Determine if the index node is in use (not obsolete) */
		in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
					    snod->offs);
		if (in_use < 0) {
			ubifs_scan_destroy(sleb);
			return in_use; /* Error code */
		}
		if (in_use) {
			/* In use and dirty - account the dirty space */
			if (in_use == 1)
				dirt += ALIGN(snod->len, 8);
			/*
			 * The obsolete index nodes form gaps that can be
			 * overwritten. This gap has ended because we have
			 * found an index node that is still in use
			 * i.e. not obsolete
			 */
			gap_end = snod->offs;
			/* Try to fill gap */
			written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
			if (written < 0) {
				ubifs_scan_destroy(sleb);
				return written; /* Error code */
			}
			tot_written += written;
			/* The next gap starts just past this live node */
			gap_start = ALIGN(snod->offs + snod->len, 8);
		}
	}
	ubifs_scan_destroy(sleb);
	/* Fill the trailing gap up to the end of the LEB */
	c->ileb_len = c->leb_size;
	gap_end = c->leb_size;
	/* Try to fill gap */
	written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
	if (written < 0)
		return written; /* Error code */
	tot_written += written;
	if (tot_written == 0) {
		struct ubifs_lprops lp;

		dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
		err = ubifs_read_one_lp(c, lnum, &lp);
		if (err)
			return err;
		if (lp.free == c->leb_size) {
			/*
			 * We must have snatched this LEB from the idx_gc list
			 * so we need to correct the free and dirty space.
			 */
			err = ubifs_change_one_lp(c, lnum,
						  c->leb_size - c->ileb_len,
						  dirt, 0, 0, 0);
			if (err)
				return err;
		}
		return 0;
	}
	err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
				  0, 0, 0);
	if (err)
		return err;
	/* Atomically replace the LEB contents with the refilled buffer */
	err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
			       UBI_SHORTTERM);
	if (err)
		return err;
	dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
	return tot_written;
}
| 327 | |||
| 328 | /** | ||
| 329 | * get_leb_cnt - calculate the number of empty LEBs needed to commit. | ||
| 330 | * @c: UBIFS file-system description object | ||
| 331 | * @cnt: number of znodes to commit | ||
| 332 | * | ||
| 333 | * This function returns the number of empty LEBs needed to commit @cnt znodes | ||
| 334 | * to the current index head. The number is not exact and may be more than | ||
| 335 | * needed. | ||
| 336 | */ | ||
| 337 | static int get_leb_cnt(struct ubifs_info *c, int cnt) | ||
| 338 | { | ||
| 339 | int d; | ||
| 340 | |||
| 341 | /* Assume maximum index node size (i.e. overestimate space needed) */ | ||
| 342 | cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz; | ||
| 343 | if (cnt < 0) | ||
| 344 | cnt = 0; | ||
| 345 | d = c->leb_size / c->max_idx_node_sz; | ||
| 346 | return DIV_ROUND_UP(cnt, d); | ||
| 347 | } | ||
| 348 | |||
| 349 | /** | ||
| 350 | * layout_in_gaps - in-the-gaps method of committing TNC. | ||
| 351 | * @c: UBIFS file-system description object | ||
| 352 | * @cnt: number of dirty znodes to commit. | ||
| 353 | * | ||
| 354 | * This function lays out new index nodes for dirty znodes using in-the-gaps | ||
| 355 | * method of TNC commit. | ||
| 356 | * | ||
| 357 | * This function returns %0 on success and a negative error code on failure. | ||
| 358 | */ | ||
| 359 | static int layout_in_gaps(struct ubifs_info *c, int cnt) | ||
| 360 | { | ||
| 361 | int err, leb_needed_cnt, written, *p; | ||
| 362 | |||
| 363 | dbg_gc("%d znodes to write", cnt); | ||
| 364 | |||
| 365 | c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); | ||
| 366 | if (!c->gap_lebs) | ||
| 367 | return -ENOMEM; | ||
| 368 | |||
| 369 | p = c->gap_lebs; | ||
| 370 | do { | ||
| 371 | ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); | ||
| 372 | written = layout_leb_in_gaps(c, p); | ||
| 373 | if (written < 0) { | ||
| 374 | err = written; | ||
| 375 | if (err == -ENOSPC) { | ||
| 376 | if (!dbg_force_in_the_gaps_enabled) { | ||
| 377 | /* | ||
| 378 | * Do not print scary warnings if the | ||
| 379 | * debugging option which forces | ||
| 380 | * in-the-gaps is enabled. | ||
| 381 | */ | ||
| 382 | ubifs_err("out of space"); | ||
| 383 | spin_lock(&c->space_lock); | ||
| 384 | dbg_dump_budg(c); | ||
| 385 | spin_unlock(&c->space_lock); | ||
| 386 | dbg_dump_lprops(c); | ||
| 387 | } | ||
| 388 | /* Try to commit anyway */ | ||
| 389 | err = 0; | ||
| 390 | break; | ||
| 391 | } | ||
| 392 | kfree(c->gap_lebs); | ||
| 393 | c->gap_lebs = NULL; | ||
| 394 | return err; | ||
| 395 | } | ||
| 396 | p++; | ||
| 397 | cnt -= written; | ||
| 398 | leb_needed_cnt = get_leb_cnt(c, cnt); | ||
| 399 | dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, | ||
| 400 | leb_needed_cnt, c->ileb_cnt); | ||
| 401 | } while (leb_needed_cnt > c->ileb_cnt); | ||
| 402 | |||
| 403 | *p = -1; | ||
| 404 | return 0; | ||
| 405 | } | ||
| 406 | |||
| 407 | /** | ||
| 408 | * layout_in_empty_space - layout index nodes in empty space. | ||
| 409 | * @c: UBIFS file-system description object | ||
| 410 | * | ||
| 411 | * This function lays out new index nodes for dirty znodes using empty LEBs. | ||
| 412 | * | ||
| 413 | * This function returns %0 on success and a negative error code on failure. | ||
| 414 | */ | ||
| 415 | static int layout_in_empty_space(struct ubifs_info *c) | ||
| 416 | { | ||
| 417 | struct ubifs_znode *znode, *cnext, *zp; | ||
| 418 | int lnum, offs, len, next_len, buf_len, buf_offs, used, avail; | ||
| 419 | int wlen, blen, err; | ||
| 420 | |||
| 421 | cnext = c->enext; | ||
| 422 | if (!cnext) | ||
| 423 | return 0; | ||
| 424 | |||
| 425 | lnum = c->ihead_lnum; | ||
| 426 | buf_offs = c->ihead_offs; | ||
| 427 | |||
| 428 | buf_len = ubifs_idx_node_sz(c, c->fanout); | ||
| 429 | buf_len = ALIGN(buf_len, c->min_io_size); | ||
| 430 | used = 0; | ||
| 431 | avail = buf_len; | ||
| 432 | |||
| 433 | /* Ensure there is enough room for first write */ | ||
| 434 | next_len = ubifs_idx_node_sz(c, cnext->child_cnt); | ||
| 435 | if (buf_offs + next_len > c->leb_size) | ||
| 436 | lnum = -1; | ||
| 437 | |||
| 438 | while (1) { | ||
| 439 | znode = cnext; | ||
| 440 | |||
| 441 | len = ubifs_idx_node_sz(c, znode->child_cnt); | ||
| 442 | |||
| 443 | /* Determine the index node position */ | ||
| 444 | if (lnum == -1) { | ||
| 445 | if (c->ileb_nxt >= c->ileb_cnt) { | ||
| 446 | ubifs_err("out of space"); | ||
| 447 | return -ENOSPC; | ||
| 448 | } | ||
| 449 | lnum = c->ilebs[c->ileb_nxt++]; | ||
| 450 | buf_offs = 0; | ||
| 451 | used = 0; | ||
| 452 | avail = buf_len; | ||
| 453 | } | ||
| 454 | |||
| 455 | offs = buf_offs + used; | ||
| 456 | |||
| 457 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 458 | znode->lnum = lnum; | ||
| 459 | znode->offs = offs; | ||
| 460 | znode->len = len; | ||
| 461 | #endif | ||
| 462 | |||
| 463 | /* Update the parent */ | ||
| 464 | zp = znode->parent; | ||
| 465 | if (zp) { | ||
| 466 | struct ubifs_zbranch *zbr; | ||
| 467 | int i; | ||
| 468 | |||
| 469 | i = znode->iip; | ||
| 470 | zbr = &zp->zbranch[i]; | ||
| 471 | zbr->lnum = lnum; | ||
| 472 | zbr->offs = offs; | ||
| 473 | zbr->len = len; | ||
| 474 | } else { | ||
| 475 | c->zroot.lnum = lnum; | ||
| 476 | c->zroot.offs = offs; | ||
| 477 | c->zroot.len = len; | ||
| 478 | } | ||
| 479 | c->calc_idx_sz += ALIGN(len, 8); | ||
| 480 | |||
| 481 | /* | ||
| 482 | * Once lprops is updated, we can decrease the dirty znode count | ||
| 483 | * but it is easier to just do it here. | ||
| 484 | */ | ||
| 485 | atomic_long_dec(&c->dirty_zn_cnt); | ||
| 486 | |||
| 487 | /* | ||
| 488 | * Calculate the next index node length to see if there is | ||
| 489 | * enough room for it | ||
| 490 | */ | ||
| 491 | cnext = znode->cnext; | ||
| 492 | if (cnext == c->cnext) | ||
| 493 | next_len = 0; | ||
| 494 | else | ||
| 495 | next_len = ubifs_idx_node_sz(c, cnext->child_cnt); | ||
| 496 | |||
| 497 | if (c->min_io_size == 1) { | ||
| 498 | buf_offs += ALIGN(len, 8); | ||
| 499 | if (next_len) { | ||
| 500 | if (buf_offs + next_len <= c->leb_size) | ||
| 501 | continue; | ||
| 502 | err = ubifs_update_one_lp(c, lnum, 0, | ||
| 503 | c->leb_size - buf_offs, 0, 0); | ||
| 504 | if (err) | ||
| 505 | return err; | ||
| 506 | lnum = -1; | ||
| 507 | continue; | ||
| 508 | } | ||
| 509 | err = ubifs_update_one_lp(c, lnum, | ||
| 510 | c->leb_size - buf_offs, 0, 0, 0); | ||
| 511 | if (err) | ||
| 512 | return err; | ||
| 513 | break; | ||
| 514 | } | ||
| 515 | |||
| 516 | /* Update buffer positions */ | ||
| 517 | wlen = used + len; | ||
| 518 | used += ALIGN(len, 8); | ||
| 519 | avail -= ALIGN(len, 8); | ||
| 520 | |||
| 521 | if (next_len != 0 && | ||
| 522 | buf_offs + used + next_len <= c->leb_size && | ||
| 523 | avail > 0) | ||
| 524 | continue; | ||
| 525 | |||
| 526 | if (avail <= 0 && next_len && | ||
| 527 | buf_offs + used + next_len <= c->leb_size) | ||
| 528 | blen = buf_len; | ||
| 529 | else | ||
| 530 | blen = ALIGN(wlen, c->min_io_size); | ||
| 531 | |||
| 532 | /* The buffer is full or there are no more znodes to do */ | ||
| 533 | buf_offs += blen; | ||
| 534 | if (next_len) { | ||
| 535 | if (buf_offs + next_len > c->leb_size) { | ||
| 536 | err = ubifs_update_one_lp(c, lnum, | ||
| 537 | c->leb_size - buf_offs, blen - used, | ||
| 538 | 0, 0); | ||
| 539 | if (err) | ||
| 540 | return err; | ||
| 541 | lnum = -1; | ||
| 542 | } | ||
| 543 | used -= blen; | ||
| 544 | if (used < 0) | ||
| 545 | used = 0; | ||
| 546 | avail = buf_len - used; | ||
| 547 | continue; | ||
| 548 | } | ||
| 549 | err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs, | ||
| 550 | blen - used, 0, 0); | ||
| 551 | if (err) | ||
| 552 | return err; | ||
| 553 | break; | ||
| 554 | } | ||
| 555 | |||
| 556 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 557 | c->new_ihead_lnum = lnum; | ||
| 558 | c->new_ihead_offs = buf_offs; | ||
| 559 | #endif | ||
| 560 | |||
| 561 | return 0; | ||
| 562 | } | ||
| 563 | |||
| 564 | /** | ||
| 565 | * layout_commit - determine positions of index nodes to commit. | ||
| 566 | * @c: UBIFS file-system description object | ||
| 567 | * @no_space: indicates that insufficient empty LEBs were allocated | ||
| 568 | * @cnt: number of znodes to commit | ||
| 569 | * | ||
| 570 | * Calculate and update the positions of index nodes to commit. If there were | ||
| 571 | * an insufficient number of empty LEBs allocated, then index nodes are placed | ||
| 572 | * into the gaps created by obsolete index nodes in non-empty index LEBs. For | ||
| 573 | * this purpose, an obsolete index node is one that was not in the index as at | ||
| 574 | * the end of the last commit. To write "in-the-gaps" requires that those index | ||
| 575 | * LEBs are updated atomically in-place. | ||
| 576 | */ | ||
| 577 | static int layout_commit(struct ubifs_info *c, int no_space, int cnt) | ||
| 578 | { | ||
| 579 | int err; | ||
| 580 | |||
| 581 | if (no_space) { | ||
| 582 | err = layout_in_gaps(c, cnt); | ||
| 583 | if (err) | ||
| 584 | return err; | ||
| 585 | } | ||
| 586 | err = layout_in_empty_space(c); | ||
| 587 | return err; | ||
| 588 | } | ||
| 589 | |||
| 590 | /** | ||
| 591 | * find_first_dirty - find first dirty znode. | ||
| 592 | * @znode: znode to begin searching from | ||
| 593 | */ | ||
| 594 | static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) | ||
| 595 | { | ||
| 596 | int i, cont; | ||
| 597 | |||
| 598 | if (!znode) | ||
| 599 | return NULL; | ||
| 600 | |||
| 601 | while (1) { | ||
| 602 | if (znode->level == 0) { | ||
| 603 | if (ubifs_zn_dirty(znode)) | ||
| 604 | return znode; | ||
| 605 | return NULL; | ||
| 606 | } | ||
| 607 | cont = 0; | ||
| 608 | for (i = 0; i < znode->child_cnt; i++) { | ||
| 609 | struct ubifs_zbranch *zbr = &znode->zbranch[i]; | ||
| 610 | |||
| 611 | if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { | ||
| 612 | znode = zbr->znode; | ||
| 613 | cont = 1; | ||
| 614 | break; | ||
| 615 | } | ||
| 616 | } | ||
| 617 | if (!cont) { | ||
| 618 | if (ubifs_zn_dirty(znode)) | ||
| 619 | return znode; | ||
| 620 | return NULL; | ||
| 621 | } | ||
| 622 | } | ||
| 623 | } | ||
| 624 | |||
| 625 | /** | ||
| 626 | * find_next_dirty - find next dirty znode. | ||
| 627 | * @znode: znode to begin searching from | ||
| 628 | */ | ||
| 629 | static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) | ||
| 630 | { | ||
| 631 | int n = znode->iip + 1; | ||
| 632 | |||
| 633 | znode = znode->parent; | ||
| 634 | if (!znode) | ||
| 635 | return NULL; | ||
| 636 | for (; n < znode->child_cnt; n++) { | ||
| 637 | struct ubifs_zbranch *zbr = &znode->zbranch[n]; | ||
| 638 | |||
| 639 | if (zbr->znode && ubifs_zn_dirty(zbr->znode)) | ||
| 640 | return find_first_dirty(zbr->znode); | ||
| 641 | } | ||
| 642 | return znode; | ||
| 643 | } | ||
| 644 | |||
| 645 | /** | ||
| 646 | * get_znodes_to_commit - create list of dirty znodes to commit. | ||
| 647 | * @c: UBIFS file-system description object | ||
| 648 | * | ||
| 649 | * This function returns the number of znodes to commit. | ||
| 650 | */ | ||
| 651 | static int get_znodes_to_commit(struct ubifs_info *c) | ||
| 652 | { | ||
| 653 | struct ubifs_znode *znode, *cnext; | ||
| 654 | int cnt = 0; | ||
| 655 | |||
| 656 | c->cnext = find_first_dirty(c->zroot.znode); | ||
| 657 | znode = c->enext = c->cnext; | ||
| 658 | if (!znode) { | ||
| 659 | dbg_cmt("no znodes to commit"); | ||
| 660 | return 0; | ||
| 661 | } | ||
| 662 | cnt += 1; | ||
| 663 | while (1) { | ||
| 664 | ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); | ||
| 665 | __set_bit(COW_ZNODE, &znode->flags); | ||
| 666 | znode->alt = 0; | ||
| 667 | cnext = find_next_dirty(znode); | ||
| 668 | if (!cnext) { | ||
| 669 | znode->cnext = c->cnext; | ||
| 670 | break; | ||
| 671 | } | ||
| 672 | znode->cnext = cnext; | ||
| 673 | znode = cnext; | ||
| 674 | cnt += 1; | ||
| 675 | } | ||
| 676 | dbg_cmt("committing %d znodes", cnt); | ||
| 677 | ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); | ||
| 678 | return cnt; | ||
| 679 | } | ||
| 680 | |||
| 681 | /** | ||
| 682 | * alloc_idx_lebs - allocate empty LEBs to be used to commit. | ||
| 683 | * @c: UBIFS file-system description object | ||
| 684 | * @cnt: number of znodes to commit | ||
| 685 | * | ||
| 686 | * This function returns %-ENOSPC if it cannot allocate a sufficient number of | ||
| 687 | * empty LEBs. %0 is returned on success, otherwise a negative error code | ||
| 688 | * is returned. | ||
| 689 | */ | ||
| 690 | static int alloc_idx_lebs(struct ubifs_info *c, int cnt) | ||
| 691 | { | ||
| 692 | int i, leb_cnt, lnum; | ||
| 693 | |||
| 694 | c->ileb_cnt = 0; | ||
| 695 | c->ileb_nxt = 0; | ||
| 696 | leb_cnt = get_leb_cnt(c, cnt); | ||
| 697 | dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); | ||
| 698 | if (!leb_cnt) | ||
| 699 | return 0; | ||
| 700 | c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); | ||
| 701 | if (!c->ilebs) | ||
| 702 | return -ENOMEM; | ||
| 703 | for (i = 0; i < leb_cnt; i++) { | ||
| 704 | lnum = ubifs_find_free_leb_for_idx(c); | ||
| 705 | if (lnum < 0) | ||
| 706 | return lnum; | ||
| 707 | c->ilebs[c->ileb_cnt++] = lnum; | ||
| 708 | dbg_cmt("LEB %d", lnum); | ||
| 709 | } | ||
| 710 | if (dbg_force_in_the_gaps()) | ||
| 711 | return -ENOSPC; | ||
| 712 | return 0; | ||
| 713 | } | ||
| 714 | |||
| 715 | /** | ||
| 716 | * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. | ||
| 717 | * @c: UBIFS file-system description object | ||
| 718 | * | ||
| 719 | * It is possible that we allocate more empty LEBs for the commit than we need. | ||
| 720 | * This functions frees the surplus. | ||
| 721 | * | ||
| 722 | * This function returns %0 on success and a negative error code on failure. | ||
| 723 | */ | ||
| 724 | static int free_unused_idx_lebs(struct ubifs_info *c) | ||
| 725 | { | ||
| 726 | int i, err = 0, lnum, er; | ||
| 727 | |||
| 728 | for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { | ||
| 729 | lnum = c->ilebs[i]; | ||
| 730 | dbg_cmt("LEB %d", lnum); | ||
| 731 | er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, | ||
| 732 | LPROPS_INDEX | LPROPS_TAKEN, 0); | ||
| 733 | if (!err) | ||
| 734 | err = er; | ||
| 735 | } | ||
| 736 | return err; | ||
| 737 | } | ||
| 738 | |||
| 739 | /** | ||
| 740 | * free_idx_lebs - free unused LEBs after commit end. | ||
| 741 | * @c: UBIFS file-system description object | ||
| 742 | * | ||
| 743 | * This function returns %0 on success and a negative error code on failure. | ||
| 744 | */ | ||
| 745 | static int free_idx_lebs(struct ubifs_info *c) | ||
| 746 | { | ||
| 747 | int err; | ||
| 748 | |||
| 749 | err = free_unused_idx_lebs(c); | ||
| 750 | kfree(c->ilebs); | ||
| 751 | c->ilebs = NULL; | ||
| 752 | return err; | ||
| 753 | } | ||
| 754 | |||
| 755 | /** | ||
| 756 | * ubifs_tnc_start_commit - start TNC commit. | ||
| 757 | * @c: UBIFS file-system description object | ||
| 758 | * @zroot: new index root position is returned here | ||
| 759 | * | ||
| 760 | * This function prepares the list of indexing nodes to commit and lays out | ||
| 761 | * their positions on flash. If there is not enough free space it uses the | ||
| 762 | * in-gap commit method. Returns zero in case of success and a negative error | ||
| 763 | * code in case of failure. | ||
| 764 | */ | ||
| 765 | int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) | ||
| 766 | { | ||
| 767 | int err = 0, cnt; | ||
| 768 | |||
| 769 | mutex_lock(&c->tnc_mutex); | ||
| 770 | err = dbg_check_tnc(c, 1); | ||
| 771 | if (err) | ||
| 772 | goto out; | ||
| 773 | cnt = get_znodes_to_commit(c); | ||
| 774 | if (cnt != 0) { | ||
| 775 | int no_space = 0; | ||
| 776 | |||
| 777 | err = alloc_idx_lebs(c, cnt); | ||
| 778 | if (err == -ENOSPC) | ||
| 779 | no_space = 1; | ||
| 780 | else if (err) | ||
| 781 | goto out_free; | ||
| 782 | err = layout_commit(c, no_space, cnt); | ||
| 783 | if (err) | ||
| 784 | goto out_free; | ||
| 785 | ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); | ||
| 786 | err = free_unused_idx_lebs(c); | ||
| 787 | if (err) | ||
| 788 | goto out; | ||
| 789 | } | ||
| 790 | destroy_old_idx(c); | ||
| 791 | memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); | ||
| 792 | |||
| 793 | err = ubifs_save_dirty_idx_lnums(c); | ||
| 794 | if (err) | ||
| 795 | goto out; | ||
| 796 | |||
| 797 | spin_lock(&c->space_lock); | ||
| 798 | /* | ||
| 799 | * Although we have not finished committing yet, update size of the | ||
| 800 | * committed index ('c->old_idx_sz') and zero out the index growth | ||
| 801 | * budget. It is OK to do this now, because we've reserved all the | ||
| 802 | * space which is needed to commit the index, and it is save for the | ||
| 803 | * budgeting subsystem to assume the index is already committed, | ||
| 804 | * even though it is not. | ||
| 805 | */ | ||
| 806 | c->old_idx_sz = c->calc_idx_sz; | ||
| 807 | c->budg_uncommitted_idx = 0; | ||
| 808 | spin_unlock(&c->space_lock); | ||
| 809 | mutex_unlock(&c->tnc_mutex); | ||
| 810 | |||
| 811 | dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); | ||
| 812 | dbg_cmt("size of index %llu", c->calc_idx_sz); | ||
| 813 | return err; | ||
| 814 | |||
| 815 | out_free: | ||
| 816 | free_idx_lebs(c); | ||
| 817 | out: | ||
| 818 | mutex_unlock(&c->tnc_mutex); | ||
| 819 | return err; | ||
| 820 | } | ||
| 821 | |||
| 822 | /** | ||
| 823 | * write_index - write index nodes. | ||
| 824 | * @c: UBIFS file-system description object | ||
| 825 | * | ||
| 826 | * This function writes the index nodes whose positions were laid out in the | ||
| 827 | * layout_in_empty_space function. | ||
| 828 | */ | ||
| 829 | static int write_index(struct ubifs_info *c) | ||
| 830 | { | ||
| 831 | struct ubifs_idx_node *idx; | ||
| 832 | struct ubifs_znode *znode, *cnext; | ||
| 833 | int i, lnum, offs, len, next_len, buf_len, buf_offs, used; | ||
| 834 | int avail, wlen, err, lnum_pos = 0; | ||
| 835 | |||
| 836 | cnext = c->enext; | ||
| 837 | if (!cnext) | ||
| 838 | return 0; | ||
| 839 | |||
| 840 | /* | ||
| 841 | * Always write index nodes to the index head so that index nodes and | ||
| 842 | * other types of nodes are never mixed in the same erase block. | ||
| 843 | */ | ||
| 844 | lnum = c->ihead_lnum; | ||
| 845 | buf_offs = c->ihead_offs; | ||
| 846 | |||
| 847 | /* Allocate commit buffer */ | ||
| 848 | buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); | ||
| 849 | used = 0; | ||
| 850 | avail = buf_len; | ||
| 851 | |||
| 852 | /* Ensure there is enough room for first write */ | ||
| 853 | next_len = ubifs_idx_node_sz(c, cnext->child_cnt); | ||
| 854 | if (buf_offs + next_len > c->leb_size) { | ||
| 855 | err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, | ||
| 856 | LPROPS_TAKEN); | ||
| 857 | if (err) | ||
| 858 | return err; | ||
| 859 | lnum = -1; | ||
| 860 | } | ||
| 861 | |||
| 862 | while (1) { | ||
| 863 | cond_resched(); | ||
| 864 | |||
| 865 | znode = cnext; | ||
| 866 | idx = c->cbuf + used; | ||
| 867 | |||
| 868 | /* Make index node */ | ||
| 869 | idx->ch.node_type = UBIFS_IDX_NODE; | ||
| 870 | idx->child_cnt = cpu_to_le16(znode->child_cnt); | ||
| 871 | idx->level = cpu_to_le16(znode->level); | ||
| 872 | for (i = 0; i < znode->child_cnt; i++) { | ||
| 873 | struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); | ||
| 874 | struct ubifs_zbranch *zbr = &znode->zbranch[i]; | ||
| 875 | |||
| 876 | key_write_idx(c, &zbr->key, &br->key); | ||
| 877 | br->lnum = cpu_to_le32(zbr->lnum); | ||
| 878 | br->offs = cpu_to_le32(zbr->offs); | ||
| 879 | br->len = cpu_to_le32(zbr->len); | ||
| 880 | if (!zbr->lnum || !zbr->len) { | ||
| 881 | ubifs_err("bad ref in znode"); | ||
| 882 | dbg_dump_znode(c, znode); | ||
| 883 | if (zbr->znode) | ||
| 884 | dbg_dump_znode(c, zbr->znode); | ||
| 885 | } | ||
| 886 | } | ||
| 887 | len = ubifs_idx_node_sz(c, znode->child_cnt); | ||
| 888 | ubifs_prepare_node(c, idx, len, 0); | ||
| 889 | |||
| 890 | /* Determine the index node position */ | ||
| 891 | if (lnum == -1) { | ||
| 892 | lnum = c->ilebs[lnum_pos++]; | ||
| 893 | buf_offs = 0; | ||
| 894 | used = 0; | ||
| 895 | avail = buf_len; | ||
| 896 | } | ||
| 897 | offs = buf_offs + used; | ||
| 898 | |||
| 899 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 900 | if (lnum != znode->lnum || offs != znode->offs || | ||
| 901 | len != znode->len) { | ||
| 902 | ubifs_err("inconsistent znode posn"); | ||
| 903 | return -EINVAL; | ||
| 904 | } | ||
| 905 | #endif | ||
| 906 | |||
| 907 | /* Grab some stuff from znode while we still can */ | ||
| 908 | cnext = znode->cnext; | ||
| 909 | |||
| 910 | ubifs_assert(ubifs_zn_dirty(znode)); | ||
| 911 | ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); | ||
| 912 | |||
| 913 | /* | ||
| 914 | * It is important that other threads should see %DIRTY_ZNODE | ||
| 915 | * flag cleared before %COW_ZNODE. Specifically, it matters in | ||
| 916 | * the 'dirty_cow_znode()' function. This is the reason for the | ||
| 917 | * first barrier. Also, we want the bit changes to be seen to | ||
| 918 | * other threads ASAP, to avoid unnecesarry copying, which is | ||
| 919 | * the reason for the second barrier. | ||
| 920 | */ | ||
| 921 | clear_bit(DIRTY_ZNODE, &znode->flags); | ||
| 922 | smp_mb__before_clear_bit(); | ||
| 923 | clear_bit(COW_ZNODE, &znode->flags); | ||
| 924 | smp_mb__after_clear_bit(); | ||
| 925 | |||
| 926 | /* Do not access znode from this point on */ | ||
| 927 | |||
| 928 | /* Update buffer positions */ | ||
| 929 | wlen = used + len; | ||
| 930 | used += ALIGN(len, 8); | ||
| 931 | avail -= ALIGN(len, 8); | ||
| 932 | |||
| 933 | /* | ||
| 934 | * Calculate the next index node length to see if there is | ||
| 935 | * enough room for it | ||
| 936 | */ | ||
| 937 | if (cnext == c->cnext) | ||
| 938 | next_len = 0; | ||
| 939 | else | ||
| 940 | next_len = ubifs_idx_node_sz(c, cnext->child_cnt); | ||
| 941 | |||
| 942 | if (c->min_io_size == 1) { | ||
| 943 | /* | ||
| 944 | * Write the prepared index node immediately if there is | ||
| 945 | * no minimum IO size | ||
| 946 | */ | ||
| 947 | err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, | ||
| 948 | wlen, UBI_SHORTTERM); | ||
| 949 | if (err) | ||
| 950 | return err; | ||
| 951 | buf_offs += ALIGN(wlen, 8); | ||
| 952 | if (next_len) { | ||
| 953 | used = 0; | ||
| 954 | avail = buf_len; | ||
| 955 | if (buf_offs + next_len > c->leb_size) { | ||
| 956 | err = ubifs_update_one_lp(c, lnum, | ||
| 957 | LPROPS_NC, 0, 0, LPROPS_TAKEN); | ||
| 958 | if (err) | ||
| 959 | return err; | ||
| 960 | lnum = -1; | ||
| 961 | } | ||
| 962 | continue; | ||
| 963 | } | ||
| 964 | } else { | ||
| 965 | int blen, nxt_offs = buf_offs + used + next_len; | ||
| 966 | |||
| 967 | if (next_len && nxt_offs <= c->leb_size) { | ||
| 968 | if (avail > 0) | ||
| 969 | continue; | ||
| 970 | else | ||
| 971 | blen = buf_len; | ||
| 972 | } else { | ||
| 973 | wlen = ALIGN(wlen, 8); | ||
| 974 | blen = ALIGN(wlen, c->min_io_size); | ||
| 975 | ubifs_pad(c, c->cbuf + wlen, blen - wlen); | ||
| 976 | } | ||
| 977 | /* | ||
| 978 | * The buffer is full or there are no more znodes | ||
| 979 | * to do | ||
| 980 | */ | ||
| 981 | err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, | ||
| 982 | blen, UBI_SHORTTERM); | ||
| 983 | if (err) | ||
| 984 | return err; | ||
| 985 | buf_offs += blen; | ||
| 986 | if (next_len) { | ||
| 987 | if (nxt_offs > c->leb_size) { | ||
| 988 | err = ubifs_update_one_lp(c, lnum, | ||
| 989 | LPROPS_NC, 0, 0, LPROPS_TAKEN); | ||
| 990 | if (err) | ||
| 991 | return err; | ||
| 992 | lnum = -1; | ||
| 993 | } | ||
| 994 | used -= blen; | ||
| 995 | if (used < 0) | ||
| 996 | used = 0; | ||
| 997 | avail = buf_len - used; | ||
| 998 | memmove(c->cbuf, c->cbuf + blen, used); | ||
| 999 | continue; | ||
| 1000 | } | ||
| 1001 | } | ||
| 1002 | break; | ||
| 1003 | } | ||
| 1004 | |||
| 1005 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 1006 | if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { | ||
| 1007 | ubifs_err("inconsistent ihead"); | ||
| 1008 | return -EINVAL; | ||
| 1009 | } | ||
| 1010 | #endif | ||
| 1011 | |||
| 1012 | c->ihead_lnum = lnum; | ||
| 1013 | c->ihead_offs = buf_offs; | ||
| 1014 | |||
| 1015 | return 0; | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | /** | ||
| 1019 | * free_obsolete_znodes - free obsolete znodes. | ||
| 1020 | * @c: UBIFS file-system description object | ||
| 1021 | * | ||
| 1022 | * At the end of commit end, obsolete znodes are freed. | ||
| 1023 | */ | ||
| 1024 | static void free_obsolete_znodes(struct ubifs_info *c) | ||
| 1025 | { | ||
| 1026 | struct ubifs_znode *znode, *cnext; | ||
| 1027 | |||
| 1028 | cnext = c->cnext; | ||
| 1029 | do { | ||
| 1030 | znode = cnext; | ||
| 1031 | cnext = znode->cnext; | ||
| 1032 | if (test_bit(OBSOLETE_ZNODE, &znode->flags)) | ||
| 1033 | kfree(znode); | ||
| 1034 | else { | ||
| 1035 | znode->cnext = NULL; | ||
| 1036 | atomic_long_inc(&c->clean_zn_cnt); | ||
| 1037 | atomic_long_inc(&ubifs_clean_zn_cnt); | ||
| 1038 | } | ||
| 1039 | } while (cnext != c->cnext); | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | /** | ||
| 1043 | * return_gap_lebs - return LEBs used by the in-gap commit method. | ||
| 1044 | * @c: UBIFS file-system description object | ||
| 1045 | * | ||
| 1046 | * This function clears the "taken" flag for the LEBs which were used by the | ||
| 1047 | * "commit in-the-gaps" method. | ||
| 1048 | */ | ||
| 1049 | static int return_gap_lebs(struct ubifs_info *c) | ||
| 1050 | { | ||
| 1051 | int *p, err; | ||
| 1052 | |||
| 1053 | if (!c->gap_lebs) | ||
| 1054 | return 0; | ||
| 1055 | |||
| 1056 | dbg_cmt(""); | ||
| 1057 | for (p = c->gap_lebs; *p != -1; p++) { | ||
| 1058 | err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, | ||
| 1059 | LPROPS_TAKEN, 0); | ||
| 1060 | if (err) | ||
| 1061 | return err; | ||
| 1062 | } | ||
| 1063 | |||
| 1064 | kfree(c->gap_lebs); | ||
| 1065 | c->gap_lebs = NULL; | ||
| 1066 | return 0; | ||
| 1067 | } | ||
| 1068 | |||
| 1069 | /** | ||
| 1070 | * ubifs_tnc_end_commit - update the TNC for commit end. | ||
| 1071 | * @c: UBIFS file-system description object | ||
| 1072 | * | ||
| 1073 | * Write the dirty znodes. | ||
| 1074 | */ | ||
| 1075 | int ubifs_tnc_end_commit(struct ubifs_info *c) | ||
| 1076 | { | ||
| 1077 | int err; | ||
| 1078 | |||
| 1079 | if (!c->cnext) | ||
| 1080 | return 0; | ||
| 1081 | |||
| 1082 | err = return_gap_lebs(c); | ||
| 1083 | if (err) | ||
| 1084 | return err; | ||
| 1085 | |||
| 1086 | err = write_index(c); | ||
| 1087 | if (err) | ||
| 1088 | return err; | ||
| 1089 | |||
| 1090 | mutex_lock(&c->tnc_mutex); | ||
| 1091 | |||
| 1092 | dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); | ||
| 1093 | |||
| 1094 | free_obsolete_znodes(c); | ||
| 1095 | |||
| 1096 | c->cnext = NULL; | ||
| 1097 | kfree(c->ilebs); | ||
| 1098 | c->ilebs = NULL; | ||
| 1099 | |||
| 1100 | mutex_unlock(&c->tnc_mutex); | ||
| 1101 | |||
| 1102 | return 0; | ||
| 1103 | } | ||
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c new file mode 100644 index 000000000000..a25c1cc1f8d9 --- /dev/null +++ b/fs/ubifs/tnc_misc.c | |||
| @@ -0,0 +1,494 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Adrian Hunter | ||
| 20 | * Artem Bityutskiy (Битюцкий Артём) | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
 * This file contains miscellaneous TNC-related functions shared between
 * different files. This file does not form any logically separate TNC
| 26 | * sub-system. The file was created because there is a lot of TNC code and | ||
| 27 | * putting it all in one file would make that file too big and unreadable. | ||
| 28 | */ | ||
| 29 | |||
| 30 | #include "ubifs.h" | ||
| 31 | |||
| 32 | /** | ||
| 33 | * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. | ||
| 34 | * @zr: root of the subtree to traverse | ||
| 35 | * @znode: previous znode | ||
| 36 | * | ||
| 37 | * This function implements levelorder TNC traversal. The LNC is ignored. | ||
| 38 | * Returns the next element or %NULL if @znode is already the last one. | ||
| 39 | */ | ||
| 40 | struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, | ||
| 41 | struct ubifs_znode *znode) | ||
| 42 | { | ||
| 43 | int level, iip, level_search = 0; | ||
| 44 | struct ubifs_znode *zn; | ||
| 45 | |||
| 46 | ubifs_assert(zr); | ||
| 47 | |||
| 48 | if (unlikely(!znode)) | ||
| 49 | return zr; | ||
| 50 | |||
| 51 | if (unlikely(znode == zr)) { | ||
| 52 | if (znode->level == 0) | ||
| 53 | return NULL; | ||
| 54 | return ubifs_tnc_find_child(zr, 0); | ||
| 55 | } | ||
| 56 | |||
| 57 | level = znode->level; | ||
| 58 | |||
| 59 | iip = znode->iip; | ||
| 60 | while (1) { | ||
| 61 | ubifs_assert(znode->level <= zr->level); | ||
| 62 | |||
| 63 | /* | ||
| 64 | * First walk up until there is a znode with next branch to | ||
| 65 | * look at. | ||
| 66 | */ | ||
| 67 | while (znode->parent != zr && iip >= znode->parent->child_cnt) { | ||
| 68 | znode = znode->parent; | ||
| 69 | iip = znode->iip; | ||
| 70 | } | ||
| 71 | |||
| 72 | if (unlikely(znode->parent == zr && | ||
| 73 | iip >= znode->parent->child_cnt)) { | ||
| 74 | /* This level is done, switch to the lower one */ | ||
| 75 | level -= 1; | ||
| 76 | if (level_search || level < 0) | ||
| 77 | /* | ||
| 78 | * We were already looking for znode at lower | ||
| 79 | * level ('level_search'). As we are here | ||
| 80 | * again, it just does not exist. Or all levels | ||
| 81 | * were finished ('level < 0'). | ||
| 82 | */ | ||
| 83 | return NULL; | ||
| 84 | |||
| 85 | level_search = 1; | ||
| 86 | iip = -1; | ||
| 87 | znode = ubifs_tnc_find_child(zr, 0); | ||
| 88 | ubifs_assert(znode); | ||
| 89 | } | ||
| 90 | |||
| 91 | /* Switch to the next index */ | ||
| 92 | zn = ubifs_tnc_find_child(znode->parent, iip + 1); | ||
| 93 | if (!zn) { | ||
| 94 | /* No more children to look at, we have walk up */ | ||
| 95 | iip = znode->parent->child_cnt; | ||
| 96 | continue; | ||
| 97 | } | ||
| 98 | |||
| 99 | /* Walk back down to the level we came from ('level') */ | ||
| 100 | while (zn->level != level) { | ||
| 101 | znode = zn; | ||
| 102 | zn = ubifs_tnc_find_child(zn, 0); | ||
| 103 | if (!zn) { | ||
| 104 | /* | ||
| 105 | * This path is not too deep so it does not | ||
| 106 | * reach 'level'. Try next path. | ||
| 107 | */ | ||
| 108 | iip = znode->iip; | ||
| 109 | break; | ||
| 110 | } | ||
| 111 | } | ||
| 112 | |||
| 113 | if (zn) { | ||
| 114 | ubifs_assert(zn->level >= 0); | ||
| 115 | return zn; | ||
| 116 | } | ||
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 120 | /** | ||
| 121 | * ubifs_search_zbranch - search znode branch. | ||
| 122 | * @c: UBIFS file-system description object | ||
| 123 | * @znode: znode to search in | ||
| 124 | * @key: key to search for | ||
| 125 | * @n: znode branch slot number is returned here | ||
| 126 | * | ||
| 127 | * This is a helper function which search branch with key @key in @znode using | ||
| 128 | * binary search. The result of the search may be: | ||
| 129 | * o exact match, then %1 is returned, and the slot number of the branch is | ||
| 130 | * stored in @n; | ||
| 131 | * o no exact match, then %0 is returned and the slot number of the left | ||
| 132 | * closest branch is returned in @n; the slot if all keys in this znode are | ||
| 133 | * greater than @key, then %-1 is returned in @n. | ||
| 134 | */ | ||
| 135 | int ubifs_search_zbranch(const struct ubifs_info *c, | ||
| 136 | const struct ubifs_znode *znode, | ||
| 137 | const union ubifs_key *key, int *n) | ||
| 138 | { | ||
| 139 | int beg = 0, end = znode->child_cnt, uninitialized_var(mid); | ||
| 140 | int uninitialized_var(cmp); | ||
| 141 | const struct ubifs_zbranch *zbr = &znode->zbranch[0]; | ||
| 142 | |||
| 143 | ubifs_assert(end > beg); | ||
| 144 | |||
| 145 | while (end > beg) { | ||
| 146 | mid = (beg + end) >> 1; | ||
| 147 | cmp = keys_cmp(c, key, &zbr[mid].key); | ||
| 148 | if (cmp > 0) | ||
| 149 | beg = mid + 1; | ||
| 150 | else if (cmp < 0) | ||
| 151 | end = mid; | ||
| 152 | else { | ||
| 153 | *n = mid; | ||
| 154 | return 1; | ||
| 155 | } | ||
| 156 | } | ||
| 157 | |||
| 158 | *n = end - 1; | ||
| 159 | |||
| 160 | /* The insert point is after *n */ | ||
| 161 | ubifs_assert(*n >= -1 && *n < znode->child_cnt); | ||
| 162 | if (*n == -1) | ||
| 163 | ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); | ||
| 164 | else | ||
| 165 | ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); | ||
| 166 | if (*n + 1 < znode->child_cnt) | ||
| 167 | ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); | ||
| 168 | |||
| 169 | return 0; | ||
| 170 | } | ||
| 171 | |||
| 172 | /** | ||
| 173 | * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. | ||
| 174 | * @znode: znode to start at (root of the sub-tree to traverse) | ||
| 175 | * | ||
| 176 | * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is | ||
| 177 | * ignored. | ||
| 178 | */ | ||
| 179 | struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) | ||
| 180 | { | ||
| 181 | if (unlikely(!znode)) | ||
| 182 | return NULL; | ||
| 183 | |||
| 184 | while (znode->level > 0) { | ||
| 185 | struct ubifs_znode *child; | ||
| 186 | |||
| 187 | child = ubifs_tnc_find_child(znode, 0); | ||
| 188 | if (!child) | ||
| 189 | return znode; | ||
| 190 | znode = child; | ||
| 191 | } | ||
| 192 | |||
| 193 | return znode; | ||
| 194 | } | ||
| 195 | |||
| 196 | /** | ||
| 197 | * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. | ||
| 198 | * @znode: previous znode | ||
| 199 | * | ||
| 200 | * This function implements postorder TNC traversal. The LNC is ignored. | ||
| 201 | * Returns the next element or %NULL if @znode is already the last one. | ||
| 202 | */ | ||
| 203 | struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) | ||
| 204 | { | ||
| 205 | struct ubifs_znode *zn; | ||
| 206 | |||
| 207 | ubifs_assert(znode); | ||
| 208 | if (unlikely(!znode->parent)) | ||
| 209 | return NULL; | ||
| 210 | |||
| 211 | /* Switch to the next index in the parent */ | ||
| 212 | zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); | ||
| 213 | if (!zn) | ||
| 214 | /* This is in fact the last child, return parent */ | ||
| 215 | return znode->parent; | ||
| 216 | |||
| 217 | /* Go to the first znode in this new subtree */ | ||
| 218 | return ubifs_tnc_postorder_first(zn); | ||
| 219 | } | ||
| 220 | |||
| 221 | /** | ||
| 222 | * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. | ||
| 223 | * @znode: znode defining subtree to destroy | ||
| 224 | * | ||
| 225 | * This function destroys subtree of the TNC tree. Returns number of clean | ||
| 226 | * znodes in the subtree. | ||
| 227 | */ | ||
| 228 | long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) | ||
| 229 | { | ||
| 230 | struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); | ||
| 231 | long clean_freed = 0; | ||
| 232 | int n; | ||
| 233 | |||
| 234 | ubifs_assert(zn); | ||
| 235 | while (1) { | ||
| 236 | for (n = 0; n < zn->child_cnt; n++) { | ||
| 237 | if (!zn->zbranch[n].znode) | ||
| 238 | continue; | ||
| 239 | |||
| 240 | if (zn->level > 0 && | ||
| 241 | !ubifs_zn_dirty(zn->zbranch[n].znode)) | ||
| 242 | clean_freed += 1; | ||
| 243 | |||
| 244 | cond_resched(); | ||
| 245 | kfree(zn->zbranch[n].znode); | ||
| 246 | } | ||
| 247 | |||
| 248 | if (zn == znode) { | ||
| 249 | if (!ubifs_zn_dirty(zn)) | ||
| 250 | clean_freed += 1; | ||
| 251 | kfree(zn); | ||
| 252 | return clean_freed; | ||
| 253 | } | ||
| 254 | |||
| 255 | zn = ubifs_tnc_postorder_next(zn); | ||
| 256 | } | ||
| 257 | } | ||
| 258 | |||
| 259 | /** | ||
| 260 | * read_znode - read an indexing node from flash and fill znode. | ||
| 261 | * @c: UBIFS file-system description object | ||
| 262 | * @lnum: LEB of the indexing node to read | ||
| 263 | * @offs: node offset | ||
| 264 | * @len: node length | ||
| 265 | * @znode: znode to read to | ||
| 266 | * | ||
| 267 | * This function reads an indexing node from the flash media and fills znode | ||
| 268 | * with the read data. Returns zero in case of success and a negative error | ||
| 269 | * code in case of failure. The read indexing node is validated and if anything | ||
| 270 | * is wrong with it, this function prints complaint messages and returns | ||
| 271 | * %-EINVAL. | ||
| 272 | */ | ||
| 273 | static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, | ||
| 274 | struct ubifs_znode *znode) | ||
| 275 | { | ||
| 276 | int i, err, type, cmp; | ||
| 277 | struct ubifs_idx_node *idx; | ||
| 278 | |||
| 279 | idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); | ||
| 280 | if (!idx) | ||
| 281 | return -ENOMEM; | ||
| 282 | |||
| 283 | err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); | ||
| 284 | if (err < 0) { | ||
| 285 | kfree(idx); | ||
| 286 | return err; | ||
| 287 | } | ||
| 288 | |||
| 289 | znode->child_cnt = le16_to_cpu(idx->child_cnt); | ||
| 290 | znode->level = le16_to_cpu(idx->level); | ||
| 291 | |||
| 292 | dbg_tnc("LEB %d:%d, level %d, %d branch", | ||
| 293 | lnum, offs, znode->level, znode->child_cnt); | ||
| 294 | |||
| 295 | if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { | ||
| 296 | dbg_err("current fanout %d, branch count %d", | ||
| 297 | c->fanout, znode->child_cnt); | ||
| 298 | dbg_err("max levels %d, znode level %d", | ||
| 299 | UBIFS_MAX_LEVELS, znode->level); | ||
| 300 | err = 1; | ||
| 301 | goto out_dump; | ||
| 302 | } | ||
| 303 | |||
| 304 | for (i = 0; i < znode->child_cnt; i++) { | ||
| 305 | const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); | ||
| 306 | struct ubifs_zbranch *zbr = &znode->zbranch[i]; | ||
| 307 | |||
| 308 | key_read(c, &br->key, &zbr->key); | ||
| 309 | zbr->lnum = le32_to_cpu(br->lnum); | ||
| 310 | zbr->offs = le32_to_cpu(br->offs); | ||
| 311 | zbr->len = le32_to_cpu(br->len); | ||
| 312 | zbr->znode = NULL; | ||
| 313 | |||
| 314 | /* Validate branch */ | ||
| 315 | |||
| 316 | if (zbr->lnum < c->main_first || | ||
| 317 | zbr->lnum >= c->leb_cnt || zbr->offs < 0 || | ||
| 318 | zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { | ||
| 319 | dbg_err("bad branch %d", i); | ||
| 320 | err = 2; | ||
| 321 | goto out_dump; | ||
| 322 | } | ||
| 323 | |||
| 324 | switch (key_type(c, &zbr->key)) { | ||
| 325 | case UBIFS_INO_KEY: | ||
| 326 | case UBIFS_DATA_KEY: | ||
| 327 | case UBIFS_DENT_KEY: | ||
| 328 | case UBIFS_XENT_KEY: | ||
| 329 | break; | ||
| 330 | default: | ||
| 331 | dbg_msg("bad key type at slot %d: %s", i, | ||
| 332 | DBGKEY(&zbr->key)); | ||
| 333 | err = 3; | ||
| 334 | goto out_dump; | ||
| 335 | } | ||
| 336 | |||
| 337 | if (znode->level) | ||
| 338 | continue; | ||
| 339 | |||
| 340 | type = key_type(c, &zbr->key); | ||
| 341 | if (c->ranges[type].max_len == 0) { | ||
| 342 | if (zbr->len != c->ranges[type].len) { | ||
| 343 | dbg_err("bad target node (type %d) length (%d)", | ||
| 344 | type, zbr->len); | ||
| 345 | dbg_err("have to be %d", c->ranges[type].len); | ||
| 346 | err = 4; | ||
| 347 | goto out_dump; | ||
| 348 | } | ||
| 349 | } else if (zbr->len < c->ranges[type].min_len || | ||
| 350 | zbr->len > c->ranges[type].max_len) { | ||
| 351 | dbg_err("bad target node (type %d) length (%d)", | ||
| 352 | type, zbr->len); | ||
| 353 | dbg_err("have to be in range of %d-%d", | ||
| 354 | c->ranges[type].min_len, | ||
| 355 | c->ranges[type].max_len); | ||
| 356 | err = 5; | ||
| 357 | goto out_dump; | ||
| 358 | } | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Ensure that the next key is greater or equivalent to the | ||
| 363 | * previous one. | ||
| 364 | */ | ||
| 365 | for (i = 0; i < znode->child_cnt - 1; i++) { | ||
| 366 | const union ubifs_key *key1, *key2; | ||
| 367 | |||
| 368 | key1 = &znode->zbranch[i].key; | ||
| 369 | key2 = &znode->zbranch[i + 1].key; | ||
| 370 | |||
| 371 | cmp = keys_cmp(c, key1, key2); | ||
| 372 | if (cmp > 0) { | ||
| 373 | dbg_err("bad key order (keys %d and %d)", i, i + 1); | ||
| 374 | err = 6; | ||
| 375 | goto out_dump; | ||
| 376 | } else if (cmp == 0 && !is_hash_key(c, key1)) { | ||
| 377 | /* These can only be keys with colliding hash */ | ||
| 378 | dbg_err("keys %d and %d are not hashed but equivalent", | ||
| 379 | i, i + 1); | ||
| 380 | err = 7; | ||
| 381 | goto out_dump; | ||
| 382 | } | ||
| 383 | } | ||
| 384 | |||
| 385 | kfree(idx); | ||
| 386 | return 0; | ||
| 387 | |||
| 388 | out_dump: | ||
| 389 | ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); | ||
| 390 | dbg_dump_node(c, idx); | ||
| 391 | kfree(idx); | ||
| 392 | return -EINVAL; | ||
| 393 | } | ||
| 394 | |||
| 395 | /** | ||
| 396 | * ubifs_load_znode - load znode to TNC cache. | ||
| 397 | * @c: UBIFS file-system description object | ||
| 398 | * @zbr: znode branch | ||
| 399 | * @parent: znode's parent | ||
| 400 | * @iip: index in parent | ||
| 401 | * | ||
| 402 | * This function loads znode pointed to by @zbr into the TNC cache and | ||
| 403 | * returns pointer to it in case of success and a negative error code in case | ||
| 404 | * of failure. | ||
| 405 | */ | ||
| 406 | struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, | ||
| 407 | struct ubifs_zbranch *zbr, | ||
| 408 | struct ubifs_znode *parent, int iip) | ||
| 409 | { | ||
| 410 | int err; | ||
| 411 | struct ubifs_znode *znode; | ||
| 412 | |||
| 413 | ubifs_assert(!zbr->znode); | ||
| 414 | /* | ||
| 415 | * A slab cache is not presently used for znodes because the znode size | ||
| 416 | * depends on the fanout which is stored in the superblock. | ||
| 417 | */ | ||
| 418 | znode = kzalloc(c->max_znode_sz, GFP_NOFS); | ||
| 419 | if (!znode) | ||
| 420 | return ERR_PTR(-ENOMEM); | ||
| 421 | |||
| 422 | err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); | ||
| 423 | if (err) | ||
| 424 | goto out; | ||
| 425 | |||
| 426 | atomic_long_inc(&c->clean_zn_cnt); | ||
| 427 | |||
| 428 | /* | ||
| 429 | * Increment the global clean znode counter as well. It is OK that | ||
| 430 | * global and per-FS clean znode counters may be inconsistent for some | ||
| 431 | * short time (because we might be preempted at this point), the global | ||
| 432 | * one is only used in shrinker. | ||
| 433 | */ | ||
| 434 | atomic_long_inc(&ubifs_clean_zn_cnt); | ||
| 435 | |||
| 436 | zbr->znode = znode; | ||
| 437 | znode->parent = parent; | ||
| 438 | znode->time = get_seconds(); | ||
| 439 | znode->iip = iip; | ||
| 440 | |||
| 441 | return znode; | ||
| 442 | |||
| 443 | out: | ||
| 444 | kfree(znode); | ||
| 445 | return ERR_PTR(err); | ||
| 446 | } | ||
| 447 | |||
| 448 | /** | ||
| 449 | * ubifs_tnc_read_node - read a leaf node from the flash media. | ||
| 450 | * @c: UBIFS file-system description object | ||
| 451 | * @zbr: key and position of the node | ||
| 452 | * @node: node is returned here | ||
| 453 | * | ||
| 454 | * This function reads a node defined by @zbr from the flash media. Returns | ||
| 455 | * zero in case of success or a negative negative error code in case of | ||
| 456 | * failure. | ||
| 457 | */ | ||
| 458 | int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, | ||
| 459 | void *node) | ||
| 460 | { | ||
| 461 | union ubifs_key key1, *key = &zbr->key; | ||
| 462 | int err, type = key_type(c, key); | ||
| 463 | struct ubifs_wbuf *wbuf; | ||
| 464 | |||
| 465 | /* | ||
| 466 | * 'zbr' has to point to on-flash node. The node may sit in a bud and | ||
| 467 | * may even be in a write buffer, so we have to take care about this. | ||
| 468 | */ | ||
| 469 | wbuf = ubifs_get_wbuf(c, zbr->lnum); | ||
| 470 | if (wbuf) | ||
| 471 | err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, | ||
| 472 | zbr->lnum, zbr->offs); | ||
| 473 | else | ||
| 474 | err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, | ||
| 475 | zbr->offs); | ||
| 476 | |||
| 477 | if (err) { | ||
| 478 | dbg_tnc("key %s", DBGKEY(key)); | ||
| 479 | return err; | ||
| 480 | } | ||
| 481 | |||
| 482 | /* Make sure the key of the read node is correct */ | ||
| 483 | key_read(c, key, &key1); | ||
| 484 | if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { | ||
| 485 | ubifs_err("bad key in node at LEB %d:%d", | ||
| 486 | zbr->lnum, zbr->offs); | ||
| 487 | dbg_tnc("looked for key %s found node's key %s", | ||
| 488 | DBGKEY(key), DBGKEY1(&key1)); | ||
| 489 | dbg_dump_node(c, node); | ||
| 490 | return -EINVAL; | ||
| 491 | } | ||
| 492 | |||
| 493 | return 0; | ||
| 494 | } | ||
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h new file mode 100644 index 000000000000..0cc7da9bed47 --- /dev/null +++ b/fs/ubifs/ubifs-media.h | |||
| @@ -0,0 +1,745 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file describes UBIFS on-flash format and contains definitions of all the | ||
| 25 | * relevant data structures and constants. | ||
| 26 | * | ||
| 27 | * All UBIFS on-flash objects are stored in the form of nodes. All nodes start | ||
| 28 | * with the UBIFS node magic number and have the same common header. Nodes | ||
| 29 | * always sit at 8-byte aligned positions on the media and node header sizes are | ||
| 30 | * also 8-byte aligned (except for the indexing node and the padding node). | ||
| 31 | */ | ||
| 32 | |||
| 33 | #ifndef __UBIFS_MEDIA_H__ | ||
| 34 | #define __UBIFS_MEDIA_H__ | ||
| 35 | |||
| 36 | /* UBIFS node magic number (must not have the padding byte first or last) */ | ||
| 37 | #define UBIFS_NODE_MAGIC 0x06101831 | ||
| 38 | |||
| 39 | /* UBIFS on-flash format version */ | ||
| 40 | #define UBIFS_FORMAT_VERSION 4 | ||
| 41 | |||
| 42 | /* Minimum logical eraseblock size in bytes */ | ||
| 43 | #define UBIFS_MIN_LEB_SZ (15*1024) | ||
| 44 | |||
| 45 | /* Initial CRC32 value used when calculating CRC checksums */ | ||
| 46 | #define UBIFS_CRC32_INIT 0xFFFFFFFFU | ||
| 47 | |||
| 48 | /* | ||
| 49 | * UBIFS does not try to compress data if its length is less than the below | ||
| 50 | * constant. | ||
| 51 | */ | ||
| 52 | #define UBIFS_MIN_COMPR_LEN 128 | ||
| 53 | |||
| 54 | /* Root inode number */ | ||
| 55 | #define UBIFS_ROOT_INO 1 | ||
| 56 | |||
| 57 | /* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ | ||
| 58 | #define UBIFS_FIRST_INO 64 | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Maximum file name and extended attribute length (must be a multiple of 8, | ||
| 62 | * minus 1). | ||
| 63 | */ | ||
| 64 | #define UBIFS_MAX_NLEN 255 | ||
| 65 | |||
| 66 | /* Maximum number of data journal heads */ | ||
| 67 | #define UBIFS_MAX_JHEADS 1 | ||
| 68 | |||
| 69 | /* | ||
| 70 | * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, | ||
| 71 | * which means that it does not treat the underlying media as consisting of | ||
| 72 | * blocks like in case of hard drives. Do not be confused. UBIFS block is just | ||
| 73 | * the maximum amount of data which one data node can have or which can be | ||
| 74 | * attached to an inode node. | ||
| 75 | */ | ||
| 76 | #define UBIFS_BLOCK_SIZE 4096 | ||
| 77 | #define UBIFS_BLOCK_SHIFT 12 | ||
| 78 | #define UBIFS_BLOCK_MASK 0x00000FFF | ||
| 79 | |||
| 80 | /* UBIFS padding byte pattern (must not be first or last byte of node magic) */ | ||
| 81 | #define UBIFS_PADDING_BYTE 0xCE | ||
| 82 | |||
| 83 | /* Maximum possible key length */ | ||
| 84 | #define UBIFS_MAX_KEY_LEN 16 | ||
| 85 | |||
| 86 | /* Key length ("simple" format) */ | ||
| 87 | #define UBIFS_SK_LEN 8 | ||
| 88 | |||
| 89 | /* Minimum index tree fanout */ | ||
| 90 | #define UBIFS_MIN_FANOUT 2 | ||
| 91 | |||
| 92 | /* Maximum number of levels in UBIFS indexing B-tree */ | ||
| 93 | #define UBIFS_MAX_LEVELS 512 | ||
| 94 | |||
| 95 | /* Maximum amount of data attached to an inode in bytes */ | ||
| 96 | #define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE | ||
| 97 | |||
| 98 | /* LEB Properties Tree fanout (must be power of 2) and fanout shift */ | ||
| 99 | #define UBIFS_LPT_FANOUT 4 | ||
| 100 | #define UBIFS_LPT_FANOUT_SHIFT 2 | ||
| 101 | |||
| 102 | /* LEB Properties Tree bit field sizes */ | ||
| 103 | #define UBIFS_LPT_CRC_BITS 16 | ||
| 104 | #define UBIFS_LPT_CRC_BYTES 2 | ||
| 105 | #define UBIFS_LPT_TYPE_BITS 4 | ||
| 106 | |||
| 107 | /* The key is always at the same position in all keyed nodes */ | ||
| 108 | #define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) | ||
| 109 | |||
| 110 | /* | ||
| 111 | * LEB Properties Tree node types. | ||
| 112 | * | ||
| 113 | * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) | ||
| 114 | * UBIFS_LPT_NNODE: LPT internal node | ||
| 115 | * UBIFS_LPT_LTAB: LPT's own lprops table | ||
| 116 | * UBIFS_LPT_LSAVE: LPT's save table (big model only) | ||
| 117 | * UBIFS_LPT_NODE_CNT: count of LPT node types | ||
| 118 | * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type | ||
| 119 | */ | ||
| 120 | enum { | ||
| 121 | UBIFS_LPT_PNODE, | ||
| 122 | UBIFS_LPT_NNODE, | ||
| 123 | UBIFS_LPT_LTAB, | ||
| 124 | UBIFS_LPT_LSAVE, | ||
| 125 | UBIFS_LPT_NODE_CNT, | ||
| 126 | UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, | ||
| 127 | }; | ||
| 128 | |||
| 129 | /* | ||
| 130 | * UBIFS inode types. | ||
| 131 | * | ||
| 132 | * UBIFS_ITYPE_REG: regular file | ||
| 133 | * UBIFS_ITYPE_DIR: directory | ||
| 134 | * UBIFS_ITYPE_LNK: soft link | ||
| 135 | * UBIFS_ITYPE_BLK: block device node | ||
| 136 | * UBIFS_ITYPE_CHR: character device node | ||
| 137 | * UBIFS_ITYPE_FIFO: fifo | ||
| 138 | * UBIFS_ITYPE_SOCK: socket | ||
| 139 | * UBIFS_ITYPES_CNT: count of supported file types | ||
| 140 | */ | ||
| 141 | enum { | ||
| 142 | UBIFS_ITYPE_REG, | ||
| 143 | UBIFS_ITYPE_DIR, | ||
| 144 | UBIFS_ITYPE_LNK, | ||
| 145 | UBIFS_ITYPE_BLK, | ||
| 146 | UBIFS_ITYPE_CHR, | ||
| 147 | UBIFS_ITYPE_FIFO, | ||
| 148 | UBIFS_ITYPE_SOCK, | ||
| 149 | UBIFS_ITYPES_CNT, | ||
| 150 | }; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * Supported key hash functions. | ||
| 154 | * | ||
| 155 | * UBIFS_KEY_HASH_R5: R5 hash | ||
| 156 | * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name | ||
| 157 | */ | ||
| 158 | enum { | ||
| 159 | UBIFS_KEY_HASH_R5, | ||
| 160 | UBIFS_KEY_HASH_TEST, | ||
| 161 | }; | ||
| 162 | |||
| 163 | /* | ||
| 164 | * Supported key formats. | ||
| 165 | * | ||
| 166 | * UBIFS_SIMPLE_KEY_FMT: simple key format | ||
| 167 | */ | ||
| 168 | enum { | ||
| 169 | UBIFS_SIMPLE_KEY_FMT, | ||
| 170 | }; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * The simple key format uses 29 bits for storing UBIFS block number and hash | ||
| 174 | * value. | ||
| 175 | */ | ||
| 176 | #define UBIFS_S_KEY_BLOCK_BITS 29 | ||
| 177 | #define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF | ||
| 178 | #define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS | ||
| 179 | #define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Key types. | ||
| 183 | * | ||
| 184 | * UBIFS_INO_KEY: inode node key | ||
| 185 | * UBIFS_DATA_KEY: data node key | ||
| 186 | * UBIFS_DENT_KEY: directory entry node key | ||
| 187 | * UBIFS_XENT_KEY: extended attribute entry key | ||
| 188 | * UBIFS_KEY_TYPES_CNT: number of supported key types | ||
| 189 | */ | ||
| 190 | enum { | ||
| 191 | UBIFS_INO_KEY, | ||
| 192 | UBIFS_DATA_KEY, | ||
| 193 | UBIFS_DENT_KEY, | ||
| 194 | UBIFS_XENT_KEY, | ||
| 195 | UBIFS_KEY_TYPES_CNT, | ||
| 196 | }; | ||
| 197 | |||
| 198 | /* Count of LEBs reserved for the superblock area */ | ||
| 199 | #define UBIFS_SB_LEBS 1 | ||
| 200 | /* Count of LEBs reserved for the master area */ | ||
| 201 | #define UBIFS_MST_LEBS 2 | ||
| 202 | |||
| 203 | /* First LEB of the superblock area */ | ||
| 204 | #define UBIFS_SB_LNUM 0 | ||
| 205 | /* First LEB of the master area */ | ||
| 206 | #define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) | ||
| 207 | /* First LEB of the log area */ | ||
| 208 | #define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) | ||
| 209 | |||
| 210 | /* | ||
| 211 | * The below constants define the absolute minimum values for various UBIFS | ||
| 212 | * media areas. Many of them actually depend of flash geometry and the FS | ||
| 213 | * configuration (number of journal heads, orphan LEBs, etc). This means that | ||
| 214 | * the smallest volume size which can be used for UBIFS cannot be pre-defined | ||
| 215 | * by these constants. The file-system that meets the below limitation will not | ||
| 216 | * necessarily mount. UBIFS does run-time calculations and validates the FS | ||
| 217 | * size. | ||
| 218 | */ | ||
| 219 | |||
| 220 | /* Minimum number of logical eraseblocks in the log */ | ||
| 221 | #define UBIFS_MIN_LOG_LEBS 2 | ||
| 222 | /* Minimum number of bud logical eraseblocks (one for each head) */ | ||
| 223 | #define UBIFS_MIN_BUD_LEBS 3 | ||
| 224 | /* Minimum number of journal logical eraseblocks */ | ||
| 225 | #define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) | ||
| 226 | /* Minimum number of LPT area logical eraseblocks */ | ||
| 227 | #define UBIFS_MIN_LPT_LEBS 2 | ||
| 228 | /* Minimum number of orphan area logical eraseblocks */ | ||
| 229 | #define UBIFS_MIN_ORPH_LEBS 1 | ||
| 230 | /* | ||
| 231 | * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 | ||
| 232 | * for GC, 1 for deletions, and at least 1 for committed data). | ||
| 233 | */ | ||
| 234 | #define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) | ||
| 235 | |||
| 236 | /* Minimum number of logical eraseblocks */ | ||
| 237 | #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ | ||
| 238 | UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ | ||
| 239 | UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) | ||
| 240 | |||
| 241 | /* Node sizes (N.B. these are guaranteed to be multiples of 8) */ | ||
| 242 | #define UBIFS_CH_SZ sizeof(struct ubifs_ch) | ||
| 243 | #define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) | ||
| 244 | #define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) | ||
| 245 | #define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) | ||
| 246 | #define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) | ||
| 247 | #define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) | ||
| 248 | #define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) | ||
| 249 | #define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) | ||
| 250 | #define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) | ||
| 251 | #define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) | ||
| 252 | #define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) | ||
| 253 | #define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) | ||
| 254 | /* Extended attribute entry nodes are identical to directory entry nodes */ | ||
| 255 | #define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ | ||
| 256 | /* Only this does not have to be multiple of 8 bytes */ | ||
| 257 | #define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) | ||
| 258 | |||
| 259 | /* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ | ||
| 260 | #define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) | ||
| 261 | #define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) | ||
| 262 | #define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) | ||
| 263 | #define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ | ||
| 264 | |||
| 265 | /* The largest UBIFS node */ | ||
| 266 | #define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ | ||
| 267 | |||
| 268 | /* | ||
| 269 | * On-flash inode flags. | ||
| 270 | * | ||
| 271 | * UBIFS_COMPR_FL: use compression for this inode | ||
| 272 | * UBIFS_SYNC_FL: I/O on this inode has to be synchronous | ||
| 273 | * UBIFS_IMMUTABLE_FL: inode is immutable | ||
| 274 | * UBIFS_APPEND_FL: writes to the inode may only append data | ||
| 275 | * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous | ||
| 276 | * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value | ||
| 277 | * | ||
| 278 | * Note, these are on-flash flags which correspond to ioctl flags | ||
| 279 | * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not | ||
| 280 | * have to be the same. | ||
| 281 | */ | ||
| 282 | enum { | ||
| 283 | UBIFS_COMPR_FL = 0x01, | ||
| 284 | UBIFS_SYNC_FL = 0x02, | ||
| 285 | UBIFS_IMMUTABLE_FL = 0x04, | ||
| 286 | UBIFS_APPEND_FL = 0x08, | ||
| 287 | UBIFS_DIRSYNC_FL = 0x10, | ||
| 288 | UBIFS_XATTR_FL = 0x20, | ||
| 289 | }; | ||
| 290 | |||
| 291 | /* Inode flag bits used by UBIFS */ | ||
| 292 | #define UBIFS_FL_MASK 0x0000001F | ||
| 293 | |||
| 294 | /* | ||
| 295 | * UBIFS compression algorithms. | ||
| 296 | * | ||
| 297 | * UBIFS_COMPR_NONE: no compression | ||
| 298 | * UBIFS_COMPR_LZO: LZO compression | ||
| 299 | * UBIFS_COMPR_ZLIB: ZLIB compression | ||
| 300 | * UBIFS_COMPR_TYPES_CNT: count of supported compression types | ||
| 301 | */ | ||
| 302 | enum { | ||
| 303 | UBIFS_COMPR_NONE, | ||
| 304 | UBIFS_COMPR_LZO, | ||
| 305 | UBIFS_COMPR_ZLIB, | ||
| 306 | UBIFS_COMPR_TYPES_CNT, | ||
| 307 | }; | ||
| 308 | |||
| 309 | /* | ||
| 310 | * UBIFS node types. | ||
| 311 | * | ||
| 312 | * UBIFS_INO_NODE: inode node | ||
| 313 | * UBIFS_DATA_NODE: data node | ||
| 314 | * UBIFS_DENT_NODE: directory entry node | ||
| 315 | * UBIFS_XENT_NODE: extended attribute node | ||
| 316 | * UBIFS_TRUN_NODE: truncation node | ||
| 317 | * UBIFS_PAD_NODE: padding node | ||
| 318 | * UBIFS_SB_NODE: superblock node | ||
| 319 | * UBIFS_MST_NODE: master node | ||
| 320 | * UBIFS_REF_NODE: LEB reference node | ||
| 321 | * UBIFS_IDX_NODE: index node | ||
| 322 | * UBIFS_CS_NODE: commit start node | ||
| 323 | * UBIFS_ORPH_NODE: orphan node | ||
| 324 | * UBIFS_NODE_TYPES_CNT: count of supported node types | ||
| 325 | * | ||
| 326 | * Note, we index arrays by these numbers, so keep them low and contiguous. | ||
| 327 | * Node type constants for inodes, direntries and so on have to be the same as | ||
| 328 | * corresponding key type constants. | ||
| 329 | */ | ||
| 330 | enum { | ||
| 331 | UBIFS_INO_NODE, | ||
| 332 | UBIFS_DATA_NODE, | ||
| 333 | UBIFS_DENT_NODE, | ||
| 334 | UBIFS_XENT_NODE, | ||
| 335 | UBIFS_TRUN_NODE, | ||
| 336 | UBIFS_PAD_NODE, | ||
| 337 | UBIFS_SB_NODE, | ||
| 338 | UBIFS_MST_NODE, | ||
| 339 | UBIFS_REF_NODE, | ||
| 340 | UBIFS_IDX_NODE, | ||
| 341 | UBIFS_CS_NODE, | ||
| 342 | UBIFS_ORPH_NODE, | ||
| 343 | UBIFS_NODE_TYPES_CNT, | ||
| 344 | }; | ||
| 345 | |||
| 346 | /* | ||
| 347 | * Master node flags. | ||
| 348 | * | ||
| 349 | * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty | ||
| 350 | * UBIFS_MST_NO_ORPHS: no orphan inodes present | ||
| 351 | * UBIFS_MST_RCVRY: written by recovery | ||
| 352 | */ | ||
| 353 | enum { | ||
| 354 | UBIFS_MST_DIRTY = 1, | ||
| 355 | UBIFS_MST_NO_ORPHS = 2, | ||
| 356 | UBIFS_MST_RCVRY = 4, | ||
| 357 | }; | ||
| 358 | |||
| 359 | /* | ||
| 360 | * Node group type (used by recovery to recover whole group or none). | ||
| 361 | * | ||
| 362 | * UBIFS_NO_NODE_GROUP: this node is not part of a group | ||
| 363 | * UBIFS_IN_NODE_GROUP: this node is a part of a group | ||
| 364 | * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group | ||
| 365 | */ | ||
| 366 | enum { | ||
| 367 | UBIFS_NO_NODE_GROUP = 0, | ||
| 368 | UBIFS_IN_NODE_GROUP, | ||
| 369 | UBIFS_LAST_OF_NODE_GROUP, | ||
| 370 | }; | ||
| 371 | |||
| 372 | /* | ||
| 373 | * Superblock flags. | ||
| 374 | * | ||
 * UBIFS_FLG_BIGLPT: the "big" LPT model is used if set
| 376 | */ | ||
| 377 | enum { | ||
| 378 | UBIFS_FLG_BIGLPT = 0x02, | ||
| 379 | }; | ||
| 380 | |||
| 381 | /** | ||
| 382 | * struct ubifs_ch - common header node. | ||
| 383 | * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) | ||
| 384 | * @crc: CRC-32 checksum of the node header | ||
| 385 | * @sqnum: sequence number | ||
| 386 | * @len: full node length | ||
| 387 | * @node_type: node type | ||
| 388 | * @group_type: node group type | ||
| 389 | * @padding: reserved for future, zeroes | ||
| 390 | * | ||
| 391 | * Every UBIFS node starts with this common part. If the node has a key, the | ||
| 392 | * key always goes next. | ||
| 393 | */ | ||
struct ubifs_ch {
	__le32 magic;     /* %UBIFS_NODE_MAGIC */
	__le32 crc;       /* CRC-32 checksum of the node header */
	__le64 sqnum;     /* sequence number */
	__le32 len;       /* full node length */
	__u8 node_type;   /* one of the UBIFS_*_NODE constants */
	__u8 group_type;  /* node group type (UBIFS_*_NODE_GROUP) */
	__u8 padding[2];  /* reserved for future, zeroes */
} __attribute__ ((packed)); /* on-flash layout - must stay packed */
| 403 | |||
| 404 | /** | ||
| 405 | * union ubifs_dev_desc - device node descriptor. | ||
| 406 | * @new: new type device descriptor | ||
| 407 | * @huge: huge type device descriptor | ||
| 408 | * | ||
| 409 | * This data structure describes major/minor numbers of a device node. If an | ||
| 410 | * inode is a device node then its data contains an object of this type. UBIFS | ||
| 411 | * uses standard Linux "new" and "huge" device node encodings. | ||
| 412 | */ | ||
| 413 | union ubifs_dev_desc { | ||
| 414 | __le32 new; | ||
| 415 | __le64 huge; | ||
| 416 | } __attribute__ ((packed)); | ||
| 417 | |||
| 418 | /** | ||
| 419 | * struct ubifs_ino_node - inode node. | ||
| 420 | * @ch: common header | ||
| 421 | * @key: node key | ||
| 422 | * @creat_sqnum: sequence number at time of creation | ||
| 423 | * @size: inode size in bytes (amount of uncompressed data) | ||
| 424 | * @atime_sec: access time seconds | ||
| 425 | * @ctime_sec: creation time seconds | ||
| 426 | * @mtime_sec: modification time seconds | ||
| 427 | * @atime_nsec: access time nanoseconds | ||
| 428 | * @ctime_nsec: creation time nanoseconds | ||
| 429 | * @mtime_nsec: modification time nanoseconds | ||
| 430 | * @nlink: number of hard links | ||
| 431 | * @uid: owner ID | ||
| 432 | * @gid: group ID | ||
| 433 | * @mode: access flags | ||
| 434 | * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) | ||
| 435 | * @data_len: inode data length | ||
| 436 | * @xattr_cnt: count of extended attributes this inode has | ||
| 437 | * @xattr_size: summarized size of all extended attributes in bytes | ||
| 438 | * @padding1: reserved for future, zeroes | ||
| 439 | * @xattr_names: sum of lengths of all extended attribute names belonging to | ||
| 440 | * this inode | ||
| 441 | * @compr_type: compression type used for this inode | ||
| 442 | * @padding2: reserved for future, zeroes | ||
| 443 | * @data: data attached to the inode | ||
| 444 | * | ||
| 445 | * Note, even though inode compression type is defined by @compr_type, some | ||
| 446 | * nodes of this inode may be compressed with a different compressor - this | ||
| 447 | * happens if compression type is changed while the inode already has data | ||
| 448 | * nodes. But @compr_type will be used for further writes to the inode. | ||
| 449 | * | ||
| 450 | * Note, do not forget to amend 'zero_ino_node_unused()' function when changing | ||
| 451 | * the padding fields. | ||
| 452 | */ | ||
| 453 | struct ubifs_ino_node { | ||
| 454 | struct ubifs_ch ch; | ||
| 455 | __u8 key[UBIFS_MAX_KEY_LEN]; | ||
| 456 | __le64 creat_sqnum; | ||
| 457 | __le64 size; | ||
| 458 | __le64 atime_sec; | ||
| 459 | __le64 ctime_sec; | ||
| 460 | __le64 mtime_sec; | ||
| 461 | __le32 atime_nsec; | ||
| 462 | __le32 ctime_nsec; | ||
| 463 | __le32 mtime_nsec; | ||
| 464 | __le32 nlink; | ||
| 465 | __le32 uid; | ||
| 466 | __le32 gid; | ||
| 467 | __le32 mode; | ||
| 468 | __le32 flags; | ||
| 469 | __le32 data_len; | ||
| 470 | __le32 xattr_cnt; | ||
| 471 | __le32 xattr_size; | ||
| 472 | __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ | ||
| 473 | __le32 xattr_names; | ||
| 474 | __le16 compr_type; | ||
| 475 | __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ | ||
| 476 | __u8 data[]; | ||
| 477 | } __attribute__ ((packed)); | ||
| 478 | |||
| 479 | /** | ||
| 480 | * struct ubifs_dent_node - directory entry node. | ||
| 481 | * @ch: common header | ||
| 482 | * @key: node key | ||
| 483 | * @inum: target inode number | ||
| 484 | * @padding1: reserved for future, zeroes | ||
| 485 | * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) | ||
| 486 | * @nlen: name length | ||
| 487 | * @padding2: reserved for future, zeroes | ||
| 488 | * @name: zero-terminated name | ||
| 489 | * | ||
| 490 | * Note, do not forget to amend 'zero_dent_node_unused()' function when | ||
| 491 | * changing the padding fields. | ||
| 492 | */ | ||
| 493 | struct ubifs_dent_node { | ||
| 494 | struct ubifs_ch ch; | ||
| 495 | __u8 key[UBIFS_MAX_KEY_LEN]; | ||
| 496 | __le64 inum; | ||
| 497 | __u8 padding1; | ||
| 498 | __u8 type; | ||
| 499 | __le16 nlen; | ||
| 500 | __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ | ||
| 501 | __u8 name[]; | ||
| 502 | } __attribute__ ((packed)); | ||
| 503 | |||
| 504 | /** | ||
| 505 | * struct ubifs_data_node - data node. | ||
| 506 | * @ch: common header | ||
| 507 | * @key: node key | ||
| 508 | * @size: uncompressed data size in bytes | ||
| 509 | * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) | ||
| 510 | * @padding: reserved for future, zeroes | ||
| 511 | * @data: data | ||
| 512 | * | ||
| 513 | * Note, do not forget to amend 'zero_data_node_unused()' function when | ||
| 514 | * changing the padding fields. | ||
| 515 | */ | ||
| 516 | struct ubifs_data_node { | ||
| 517 | struct ubifs_ch ch; | ||
| 518 | __u8 key[UBIFS_MAX_KEY_LEN]; | ||
| 519 | __le32 size; | ||
| 520 | __le16 compr_type; | ||
| 521 | __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ | ||
| 522 | __u8 data[]; | ||
| 523 | } __attribute__ ((packed)); | ||
| 524 | |||
| 525 | /** | ||
| 526 | * struct ubifs_trun_node - truncation node. | ||
| 527 | * @ch: common header | ||
| 528 | * @inum: truncated inode number | ||
| 529 | * @padding: reserved for future, zeroes | ||
| 530 | * @old_size: size before truncation | ||
| 531 | * @new_size: size after truncation | ||
| 532 | * | ||
| 533 | * This node exists only in the journal and never goes to the main area. Note, | ||
| 534 | * do not forget to amend 'zero_trun_node_unused()' function when changing the | ||
| 535 | * padding fields. | ||
| 536 | */ | ||
| 537 | struct ubifs_trun_node { | ||
| 538 | struct ubifs_ch ch; | ||
| 539 | __le32 inum; | ||
| 540 | __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ | ||
| 541 | __le64 old_size; | ||
| 542 | __le64 new_size; | ||
| 543 | } __attribute__ ((packed)); | ||
| 544 | |||
| 545 | /** | ||
| 546 | * struct ubifs_pad_node - padding node. | ||
| 547 | * @ch: common header | ||
| 548 | * @pad_len: how many bytes after this node are unused (because padded) | ||
| 549 | * @padding: reserved for future, zeroes | ||
| 550 | */ | ||
| 551 | struct ubifs_pad_node { | ||
| 552 | struct ubifs_ch ch; | ||
| 553 | __le32 pad_len; | ||
| 554 | } __attribute__ ((packed)); | ||
| 555 | |||
| 556 | /** | ||
| 557 | * struct ubifs_sb_node - superblock node. | ||
| 558 | * @ch: common header | ||
| 559 | * @padding: reserved for future, zeroes | ||
| 560 | * @key_hash: type of hash function used in keys | ||
| 561 | * @key_fmt: format of the key | ||
| 562 | * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) | ||
| 563 | * @min_io_size: minimal input/output unit size | ||
| 564 | * @leb_size: logical eraseblock size in bytes | ||
| 565 | * @leb_cnt: count of LEBs used by file-system | ||
| 566 | * @max_leb_cnt: maximum count of LEBs used by file-system | ||
| 567 | * @max_bud_bytes: maximum amount of data stored in buds | ||
| 568 | * @log_lebs: log size in logical eraseblocks | ||
| 569 | * @lpt_lebs: number of LEBs used for lprops table | ||
| 570 | * @orph_lebs: number of LEBs used for recording orphans | ||
| 571 | * @jhead_cnt: count of journal heads | ||
| 572 | * @fanout: tree fanout (max. number of links per indexing node) | ||
| 573 | * @lsave_cnt: number of LEB numbers in LPT's save table | ||
| 574 | * @fmt_version: UBIFS on-flash format version | ||
| 575 | * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) | ||
| 576 | * @padding1: reserved for future, zeroes | ||
| 577 | * @rp_uid: reserve pool UID | ||
| 578 | * @rp_gid: reserve pool GID | ||
| 579 | * @rp_size: size of the reserved pool in bytes | ||
| 580 | * @padding2: reserved for future, zeroes | ||
| 581 | * @time_gran: time granularity in nanoseconds | ||
| 582 | * @uuid: UUID generated when the file system image was created | ||
| 583 | */ | ||
| 584 | struct ubifs_sb_node { | ||
| 585 | struct ubifs_ch ch; | ||
| 586 | __u8 padding[2]; | ||
| 587 | __u8 key_hash; | ||
| 588 | __u8 key_fmt; | ||
| 589 | __le32 flags; | ||
| 590 | __le32 min_io_size; | ||
| 591 | __le32 leb_size; | ||
| 592 | __le32 leb_cnt; | ||
| 593 | __le32 max_leb_cnt; | ||
| 594 | __le64 max_bud_bytes; | ||
| 595 | __le32 log_lebs; | ||
| 596 | __le32 lpt_lebs; | ||
| 597 | __le32 orph_lebs; | ||
| 598 | __le32 jhead_cnt; | ||
| 599 | __le32 fanout; | ||
| 600 | __le32 lsave_cnt; | ||
| 601 | __le32 fmt_version; | ||
| 602 | __le16 default_compr; | ||
| 603 | __u8 padding1[2]; | ||
| 604 | __le32 rp_uid; | ||
| 605 | __le32 rp_gid; | ||
| 606 | __le64 rp_size; | ||
| 607 | __le32 time_gran; | ||
| 608 | __u8 uuid[16]; | ||
| 609 | __u8 padding2[3972]; | ||
| 610 | } __attribute__ ((packed)); | ||
| 611 | |||
| 612 | /** | ||
| 613 | * struct ubifs_mst_node - master node. | ||
| 614 | * @ch: common header | ||
| 615 | * @highest_inum: highest inode number in the committed index | ||
| 616 | * @cmt_no: commit number | ||
| 617 | * @flags: various flags (%UBIFS_MST_DIRTY, etc) | ||
| 618 | * @log_lnum: start of the log | ||
| 619 | * @root_lnum: LEB number of the root indexing node | ||
| 620 | * @root_offs: offset within @root_lnum | ||
| 621 | * @root_len: root indexing node length | ||
| 622 | * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was | ||
| 623 | * not reserved and should be reserved on mount) | ||
| 624 | * @ihead_lnum: LEB number of index head | ||
| 625 | * @ihead_offs: offset of index head | ||
| 626 | * @index_size: size of index on flash | ||
| 627 | * @total_free: total free space in bytes | ||
| 628 | * @total_dirty: total dirty space in bytes | ||
| 629 | * @total_used: total used space in bytes (includes only data LEBs) | ||
| 630 | * @total_dead: total dead space in bytes (includes only data LEBs) | ||
| 631 | * @total_dark: total dark space in bytes (includes only data LEBs) | ||
| 632 | * @lpt_lnum: LEB number of LPT root nnode | ||
| 633 | * @lpt_offs: offset of LPT root nnode | ||
| 634 | * @nhead_lnum: LEB number of LPT head | ||
| 635 | * @nhead_offs: offset of LPT head | ||
| 636 | * @ltab_lnum: LEB number of LPT's own lprops table | ||
| 637 | * @ltab_offs: offset of LPT's own lprops table | ||
| 638 | * @lsave_lnum: LEB number of LPT's save table (big model only) | ||
| 639 | * @lsave_offs: offset of LPT's save table (big model only) | ||
| 640 | * @lscan_lnum: LEB number of last LPT scan | ||
| 641 | * @empty_lebs: number of empty logical eraseblocks | ||
| 642 | * @idx_lebs: number of indexing logical eraseblocks | ||
| 643 | * @leb_cnt: count of LEBs used by file-system | ||
| 644 | * @padding: reserved for future, zeroes | ||
| 645 | */ | ||
| 646 | struct ubifs_mst_node { | ||
| 647 | struct ubifs_ch ch; | ||
| 648 | __le64 highest_inum; | ||
| 649 | __le64 cmt_no; | ||
| 650 | __le32 flags; | ||
| 651 | __le32 log_lnum; | ||
| 652 | __le32 root_lnum; | ||
| 653 | __le32 root_offs; | ||
| 654 | __le32 root_len; | ||
| 655 | __le32 gc_lnum; | ||
| 656 | __le32 ihead_lnum; | ||
| 657 | __le32 ihead_offs; | ||
| 658 | __le64 index_size; | ||
| 659 | __le64 total_free; | ||
| 660 | __le64 total_dirty; | ||
| 661 | __le64 total_used; | ||
| 662 | __le64 total_dead; | ||
| 663 | __le64 total_dark; | ||
| 664 | __le32 lpt_lnum; | ||
| 665 | __le32 lpt_offs; | ||
| 666 | __le32 nhead_lnum; | ||
| 667 | __le32 nhead_offs; | ||
| 668 | __le32 ltab_lnum; | ||
| 669 | __le32 ltab_offs; | ||
| 670 | __le32 lsave_lnum; | ||
| 671 | __le32 lsave_offs; | ||
| 672 | __le32 lscan_lnum; | ||
| 673 | __le32 empty_lebs; | ||
| 674 | __le32 idx_lebs; | ||
| 675 | __le32 leb_cnt; | ||
| 676 | __u8 padding[344]; | ||
| 677 | } __attribute__ ((packed)); | ||
| 678 | |||
| 679 | /** | ||
| 680 | * struct ubifs_ref_node - logical eraseblock reference node. | ||
| 681 | * @ch: common header | ||
| 682 | * @lnum: the referred logical eraseblock number | ||
| 683 | * @offs: start offset in the referred LEB | ||
| 684 | * @jhead: journal head number | ||
| 685 | * @padding: reserved for future, zeroes | ||
| 686 | */ | ||
| 687 | struct ubifs_ref_node { | ||
| 688 | struct ubifs_ch ch; | ||
| 689 | __le32 lnum; | ||
| 690 | __le32 offs; | ||
| 691 | __le32 jhead; | ||
| 692 | __u8 padding[28]; | ||
| 693 | } __attribute__ ((packed)); | ||
| 694 | |||
| 695 | /** | ||
| 696 | * struct ubifs_branch - key/reference/length branch | ||
| 697 | * @lnum: LEB number of the target node | ||
| 698 | * @offs: offset within @lnum | ||
| 699 | * @len: target node length | ||
| 700 | * @key: key | ||
| 701 | */ | ||
| 702 | struct ubifs_branch { | ||
| 703 | __le32 lnum; | ||
| 704 | __le32 offs; | ||
| 705 | __le32 len; | ||
| 706 | __u8 key[]; | ||
| 707 | } __attribute__ ((packed)); | ||
| 708 | |||
| 709 | /** | ||
| 710 | * struct ubifs_idx_node - indexing node. | ||
| 711 | * @ch: common header | ||
| 712 | * @child_cnt: number of child index nodes | ||
| 713 | * @level: tree level | ||
| 714 | * @branches: LEB number / offset / length / key branches | ||
| 715 | */ | ||
| 716 | struct ubifs_idx_node { | ||
| 717 | struct ubifs_ch ch; | ||
| 718 | __le16 child_cnt; | ||
| 719 | __le16 level; | ||
| 720 | __u8 branches[]; | ||
| 721 | } __attribute__ ((packed)); | ||
| 722 | |||
| 723 | /** | ||
| 724 | * struct ubifs_cs_node - commit start node. | ||
| 725 | * @ch: common header | ||
| 726 | * @cmt_no: commit number | ||
| 727 | */ | ||
| 728 | struct ubifs_cs_node { | ||
| 729 | struct ubifs_ch ch; | ||
| 730 | __le64 cmt_no; | ||
| 731 | } __attribute__ ((packed)); | ||
| 732 | |||
| 733 | /** | ||
| 734 | * struct ubifs_orph_node - orphan node. | ||
| 735 | * @ch: common header | ||
| 736 | * @cmt_no: commit number (also top bit is set on the last node of the commit) | ||
| 737 | * @inos: inode numbers of orphans | ||
| 738 | */ | ||
| 739 | struct ubifs_orph_node { | ||
| 740 | struct ubifs_ch ch; | ||
| 741 | __le64 cmt_no; | ||
| 742 | __le64 inos[]; | ||
| 743 | } __attribute__ ((packed)); | ||
| 744 | |||
| 745 | #endif /* __UBIFS_MEDIA_H__ */ | ||
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h new file mode 100644 index 000000000000..e4f89f271827 --- /dev/null +++ b/fs/ubifs/ubifs.h | |||
| @@ -0,0 +1,1649 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* Implementation version 0.7 */ | ||
| 24 | |||
| 25 | #ifndef __UBIFS_H__ | ||
| 26 | #define __UBIFS_H__ | ||
| 27 | |||
| 28 | #include <asm/div64.h> | ||
| 29 | #include <linux/statfs.h> | ||
| 30 | #include <linux/fs.h> | ||
| 31 | #include <linux/err.h> | ||
| 32 | #include <linux/sched.h> | ||
| 33 | #include <linux/vmalloc.h> | ||
| 34 | #include <linux/spinlock.h> | ||
| 35 | #include <linux/mutex.h> | ||
| 36 | #include <linux/rwsem.h> | ||
| 37 | #include <linux/mtd/ubi.h> | ||
| 38 | #include <linux/pagemap.h> | ||
| 39 | #include <linux/backing-dev.h> | ||
| 40 | #include "ubifs-media.h" | ||
| 41 | |||
| 42 | /* Version of this UBIFS implementation */ | ||
| 43 | #define UBIFS_VERSION 1 | ||
| 44 | |||
| 45 | /* Normal UBIFS messages */ | ||
| 46 | #define ubifs_msg(fmt, ...) \ | ||
| 47 | printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) | ||
| 48 | /* UBIFS error messages */ | ||
| 49 | #define ubifs_err(fmt, ...) \ | ||
| 50 | printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ | ||
| 51 | __func__, ##__VA_ARGS__) | ||
| 52 | /* UBIFS warning messages */ | ||
| 53 | #define ubifs_warn(fmt, ...) \ | ||
| 54 | printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ | ||
| 55 | current->pid, __func__, ##__VA_ARGS__) | ||
| 56 | |||
| 57 | /* UBIFS file system VFS magic number */ | ||
| 58 | #define UBIFS_SUPER_MAGIC 0x24051905 | ||
| 59 | |||
| 60 | /* Number of UBIFS blocks per VFS page */ | ||
| 61 | #define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) | ||
| 62 | #define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) | ||
| 63 | |||
| 64 | /* "File system end of life" sequence number watermark */ | ||
| 65 | #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL | ||
| 66 | #define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL | ||
| 67 | |||
| 68 | /* Minimum amount of data UBIFS writes to the flash */ | ||
| 69 | #define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Currently we do not support inode number overlapping and re-using, so this | ||
| 73 | * watermark defines dangerous inode number level. This should be fixed later, | ||
| 74 | * although it is difficult to exceed current limit. Another option is to use | ||
| 75 | * 64-bit inode numbers, but this means more overhead. | ||
| 76 | */ | ||
| 77 | #define INUM_WARN_WATERMARK 0xFFF00000 | ||
| 78 | #define INUM_WATERMARK 0xFFFFFF00 | ||
| 79 | |||
| 80 | /* Largest key size supported in this implementation */ | ||
| 81 | #define CUR_MAX_KEY_LEN UBIFS_SK_LEN | ||
| 82 | |||
| 83 | /* Maximum number of entries in each LPT (LEB category) heap */ | ||
| 84 | #define LPT_HEAP_SZ 256 | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Background thread name pattern. The numbers are UBI device and volume | ||
| 88 | * numbers. | ||
| 89 | */ | ||
| 90 | #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" | ||
| 91 | |||
| 92 | /* Default write-buffer synchronization timeout (5 secs) */ | ||
| 93 | #define DEFAULT_WBUF_TIMEOUT (5 * HZ) | ||
| 94 | |||
| 95 | /* Maximum possible inode number (only 32-bit inodes are supported now) */ | ||
| 96 | #define MAX_INUM 0xFFFFFFFF | ||
| 97 | |||
| 98 | /* Number of non-data journal heads */ | ||
| 99 | #define NONDATA_JHEADS_CNT 2 | ||
| 100 | |||
| 101 | /* Garbage collector head */ | ||
| 102 | #define GCHD 0 | ||
| 103 | /* Base journal head number */ | ||
| 104 | #define BASEHD 1 | ||
| 105 | /* First "general purpose" journal head */ | ||
| 106 | #define DATAHD 2 | ||
| 107 | |||
| 108 | /* 'No change' value for 'ubifs_change_lp()' */ | ||
| 109 | #define LPROPS_NC 0x80000001 | ||
| 110 | |||
| 111 | /* | ||
| 112 | * There is no notion of truncation key because truncation nodes do not exist | ||
| 113 | * in TNC. However, when replaying, it is handy to introduce fake "truncation" | ||
| 114 | * keys for truncation nodes because the code becomes simpler. So we define | ||
| 115 | * %UBIFS_TRUN_KEY type. | ||
| 116 | */ | ||
| 117 | #define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT | ||
| 118 | |||
| 119 | /* | ||
| 120 | * How much a directory entry/extended attribute entry adds to the parent/host | ||
| 121 | * inode. | ||
| 122 | */ | ||
| 123 | #define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) | ||
| 124 | |||
| 125 | /* How much an extended attribute adds to the host inode */ | ||
| 126 | #define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) | ||
| 127 | |||
| 128 | /* | ||
| 129 | * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered | ||
| 130 | * "old", and znodes which were touched last 'YOUNG_ZNODE_AGE' seconds ago are | ||
| 131 | * considered "young". This is used by shrinker when selecting znode to trim | ||
| 132 | * off. | ||
| 133 | */ | ||
| 134 | #define OLD_ZNODE_AGE 20 | ||
| 135 | #define YOUNG_ZNODE_AGE 5 | ||
| 136 | |||
| 137 | /* | ||
| 138 | * Some compressors, like LZO, may end up with more data than the input buffer. | ||
| 139 | * So UBIFS always allocates larger output buffer, to be sure the compressor | ||
| 140 | * will not corrupt memory in case of worst case compression. | ||
| 141 | */ | ||
| 142 | #define WORST_COMPR_FACTOR 2 | ||
| 143 | |||
| 144 | /* Maximum expected tree height for use by bottom_up_buf */ | ||
| 145 | #define BOTTOM_UP_HEIGHT 64 | ||
| 146 | |||
| 147 | /* | ||
| 148 | * Lockdep classes for UBIFS inode @ui_mutex. | ||
| 149 | */ | ||
| 150 | enum { | ||
| 151 | WB_MUTEX_1 = 0, | ||
| 152 | WB_MUTEX_2 = 1, | ||
| 153 | WB_MUTEX_3 = 2, | ||
| 154 | }; | ||
| 155 | |||
| 156 | /* | ||
| 157 | * Znode flags (actually, bit numbers which store the flags). | ||
| 158 | * | ||
| 159 | * DIRTY_ZNODE: znode is dirty | ||
| 160 | * COW_ZNODE: znode is being committed and a new instance of this znode has to | ||
| 161 | * be created before changing this znode | ||
| 162 | * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is | ||
| 163 | * still in the commit list and the ongoing commit operation | ||
| 164 | * will commit it, and delete this znode after it is done | ||
| 165 | */ | ||
| 166 | enum { | ||
| 167 | DIRTY_ZNODE = 0, | ||
| 168 | COW_ZNODE = 1, | ||
| 169 | OBSOLETE_ZNODE = 2, | ||
| 170 | }; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Commit states. | ||
| 174 | * | ||
| 175 | * COMMIT_RESTING: commit is not wanted | ||
| 176 | * COMMIT_BACKGROUND: background commit has been requested | ||
| 177 | * COMMIT_REQUIRED: commit is required | ||
| 178 | * COMMIT_RUNNING_BACKGROUND: background commit is running | ||
| 179 | * COMMIT_RUNNING_REQUIRED: commit is running and it is required | ||
| 180 | * COMMIT_BROKEN: commit failed | ||
| 181 | */ | ||
| 182 | enum { | ||
| 183 | COMMIT_RESTING = 0, | ||
| 184 | COMMIT_BACKGROUND, | ||
| 185 | COMMIT_REQUIRED, | ||
| 186 | COMMIT_RUNNING_BACKGROUND, | ||
| 187 | COMMIT_RUNNING_REQUIRED, | ||
| 188 | COMMIT_BROKEN, | ||
| 189 | }; | ||
| 190 | |||
| 191 | /* | ||
| 192 | * 'ubifs_scan_a_node()' return values. | ||
| 193 | * | ||
| 194 | * SCANNED_GARBAGE: scanned garbage | ||
| 195 | * SCANNED_EMPTY_SPACE: scanned empty space | ||
| 196 | * SCANNED_A_NODE: scanned a valid node | ||
| 197 | * SCANNED_A_CORRUPT_NODE: scanned a corrupted node | ||
| 198 | * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length | ||
| 199 | * | ||
| 200 | * Greater than zero means: 'scanned that number of padding bytes' | ||
| 201 | */ | ||
| 202 | enum { | ||
| 203 | SCANNED_GARBAGE = 0, | ||
| 204 | SCANNED_EMPTY_SPACE = -1, | ||
| 205 | SCANNED_A_NODE = -2, | ||
| 206 | SCANNED_A_CORRUPT_NODE = -3, | ||
| 207 | SCANNED_A_BAD_PAD_NODE = -4, | ||
| 208 | }; | ||
| 209 | |||
| 210 | /* | ||
| 211 | * LPT cnode flag bits. | ||
| 212 | * | ||
| 213 | * DIRTY_CNODE: cnode is dirty | ||
| 214 | * COW_CNODE: cnode is being committed and must be copied before writing | ||
| 215 | * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), | ||
| 216 | * so it can (and must) be freed when the commit is finished | ||
| 217 | */ | ||
| 218 | enum { | ||
| 219 | DIRTY_CNODE = 0, | ||
| 220 | COW_CNODE = 1, | ||
| 221 | OBSOLETE_CNODE = 2, | ||
| 222 | }; | ||
| 223 | |||
| 224 | /* | ||
| 225 | * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. | ||
| 226 | * | ||
| 227 | * LTAB_DIRTY: ltab node is dirty | ||
| 228 | * LSAVE_DIRTY: lsave node is dirty | ||
| 229 | */ | ||
| 230 | enum { | ||
| 231 | LTAB_DIRTY = 1, | ||
| 232 | LSAVE_DIRTY = 2, | ||
| 233 | }; | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Return codes used by the garbage collector. | ||
| 237 | * @LEB_FREED: the logical eraseblock was freed and is ready to use | ||
| 238 | * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit | ||
| 239 | * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes | ||
| 240 | */ | ||
| 241 | enum { | ||
| 242 | LEB_FREED, | ||
| 243 | LEB_FREED_IDX, | ||
| 244 | LEB_RETAINED, | ||
| 245 | }; | ||
| 246 | |||
| 247 | /** | ||
| 248 | * struct ubifs_old_idx - index node obsoleted since last commit start. | ||
| 249 | * @rb: rb-tree node | ||
| 250 | * @lnum: LEB number of obsoleted index node | ||
| 251 | * @offs: offset of obsoleted index node | ||
| 252 | */ | ||
| 253 | struct ubifs_old_idx { | ||
| 254 | struct rb_node rb; | ||
| 255 | int lnum; | ||
| 256 | int offs; | ||
| 257 | }; | ||
| 258 | |||
| 259 | /* The below union makes it easier to deal with keys */ | ||
| 260 | union ubifs_key { | ||
| 261 | uint8_t u8[CUR_MAX_KEY_LEN]; | ||
| 262 | uint32_t u32[CUR_MAX_KEY_LEN/4]; | ||
| 263 | uint64_t u64[CUR_MAX_KEY_LEN/8]; | ||
| 264 | __le32 j32[CUR_MAX_KEY_LEN/4]; | ||
| 265 | }; | ||
| 266 | |||
| 267 | /** | ||
| 268 | * struct ubifs_scan_node - UBIFS scanned node information. | ||
| 269 | * @list: list of scanned nodes | ||
| 270 | * @key: key of node scanned (if it has one) | ||
| 271 | * @sqnum: sequence number | ||
| 272 | * @type: type of node scanned | ||
| 273 | * @offs: offset within LEB of node scanned | ||
| 274 | * @len: length of node scanned | ||
| 275 | * @node: raw node | ||
| 276 | */ | ||
| 277 | struct ubifs_scan_node { | ||
| 278 | struct list_head list; | ||
| 279 | union ubifs_key key; | ||
| 280 | unsigned long long sqnum; | ||
| 281 | int type; | ||
| 282 | int offs; | ||
| 283 | int len; | ||
| 284 | void *node; | ||
| 285 | }; | ||
| 286 | |||
| 287 | /** | ||
| 288 | * struct ubifs_scan_leb - UBIFS scanned LEB information. | ||
| 289 | * @lnum: logical eraseblock number | ||
| 290 | * @nodes_cnt: number of nodes scanned | ||
| 291 | * @nodes: list of struct ubifs_scan_node | ||
| 292 | * @endpt: end point (and therefore the start of empty space) | ||
| 293 | * @ecc: read returned -EBADMSG | ||
| 294 | * @buf: buffer containing entire LEB scanned | ||
| 295 | */ | ||
| 296 | struct ubifs_scan_leb { | ||
| 297 | int lnum; | ||
| 298 | int nodes_cnt; | ||
| 299 | struct list_head nodes; | ||
| 300 | int endpt; | ||
| 301 | int ecc; | ||
| 302 | void *buf; | ||
| 303 | }; | ||
| 304 | |||
| 305 | /** | ||
| 306 | * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. | ||
| 307 | * @list: list | ||
| 308 | * @lnum: LEB number | ||
| 309 | * @unmap: OK to unmap this LEB | ||
| 310 | * | ||
| 311 | * This data structure is used to temporarily store garbage-collected indexing | ||
| 312 | * LEBs - they are not released immediately, but only after the next commit. | ||
| 313 | * This is needed to guarantee recoverability. | ||
| 314 | */ | ||
| 315 | struct ubifs_gced_idx_leb { | ||
| 316 | struct list_head list; | ||
| 317 | int lnum; | ||
| 318 | int unmap; | ||
| 319 | }; | ||
| 320 | |||
| 321 | /** | ||
| 322 | * struct ubifs_inode - UBIFS in-memory inode description. | ||
| 323 | * @vfs_inode: VFS inode description object | ||
| 324 | * @creat_sqnum: sequence number at time of creation | ||
| 325 | * @xattr_size: summarized size of all extended attributes in bytes | ||
| 326 | * @xattr_cnt: count of extended attributes this inode has | ||
| 327 | * @xattr_names: sum of lengths of all extended attribute names belonging to | ||
| 328 | * this inode | ||
| 329 | * @dirty: non-zero if the inode is dirty | ||
| 330 | * @xattr: non-zero if this is an extended attribute inode | ||
| 331 | * @ui_mutex: serializes inode write-back with the rest of VFS operations, | ||
| 332 | * serializes "clean <-> dirty" state changes, protects @dirty, | ||
| 333 | * @ui_size, and @xattr_size | ||
| 334 | * @ui_lock: protects @synced_i_size | ||
| 335 | * @synced_i_size: synchronized size of inode, i.e. the value of inode size | ||
| 336 | * currently stored on the flash; used only for regular file | ||
| 337 | * inodes | ||
| 338 | * @ui_size: inode size used by UBIFS when writing to flash | ||
| 339 | * @flags: inode flags (@UBIFS_COMPR_FL, etc) | ||
| 340 | * @compr_type: default compression type used for this inode | ||
| 341 | * @data_len: length of the data attached to the inode | ||
| 342 | * @data: inode's data | ||
| 343 | * | ||
| 344 | * @ui_mutex exists for two main reasons. At first it prevents inodes from | ||
| 345 | * being written back while UBIFS is changing them, being in the middle of a VFS | ||
| 346 | * operation. This way UBIFS makes sure the inode fields are consistent. For | ||
| 347 | * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and | ||
| 348 | * write-back must not write any of them before we have finished. | ||
| 349 | * | ||
| 350 | * The second reason is budgeting - UBIFS has to budget all operations. If an | ||
| 351 | * operation is going to mark an inode dirty, it has to allocate budget for | ||
| 352 | * this. It cannot just mark it dirty because there is no guarantee there will | ||
| 353 | * be enough flash space to write the inode back later. This means UBIFS has | ||
| 354 | * to have full control over inode "clean <-> dirty" transitions (and pages | ||
| 355 | * actually). But unfortunately, VFS marks inodes dirty in many places, and it | ||
| 356 | * does not ask the file-system if it is allowed to do so (there is a notifier, | ||
| 357 | * but it is not enough), i.e., there is no mechanism to synchronize with this. | ||
| 358 | * So UBIFS has its own inode dirty flag and its own mutex to serialize | ||
| 359 | * "clean <-> dirty" transitions. | ||
| 360 | * | ||
| 361 | * The @synced_i_size field is used to make sure we never write pages which are | ||
| 362 | * beyond last synchronized inode size. See 'ubifs_writepage()' for more | ||
| 363 | * information. | ||
| 364 | * | ||
| 365 | * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses | ||
| 366 | * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot | ||
| 367 | * make sure @inode->i_size is always changed under @ui_mutex, because it | ||
| 368 | * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock | ||
| 369 | * with 'ubifs_writepage()' (see file.c). All the other inode fields are | ||
| 370 | * changed under @ui_mutex, so they do not need "shadow" fields. Note, one | ||
| 371 | * could consider to rework locking and base it on "shadow" fields. | ||
| 372 | */ | ||
| 373 | struct ubifs_inode { | ||
| 374 | struct inode vfs_inode; | ||
| 375 | unsigned long long creat_sqnum; | ||
| 376 | unsigned int xattr_size; | ||
| 377 | unsigned int xattr_cnt; | ||
| 378 | unsigned int xattr_names; | ||
| 379 | unsigned int dirty:1; | ||
| 380 | unsigned int xattr:1; | ||
| 381 | struct mutex ui_mutex; | ||
| 382 | spinlock_t ui_lock; | ||
| 383 | loff_t synced_i_size; | ||
| 384 | loff_t ui_size; | ||
| 385 | int flags; | ||
| 386 | int compr_type; | ||
| 387 | int data_len; | ||
| 388 | void *data; | ||
| 389 | }; | ||
| 390 | |||
| 391 | /** | ||
| 392 | * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. | ||
| 393 | * @list: list | ||
| 394 | * @lnum: LEB number of recovered LEB | ||
| 395 | * @endpt: offset where recovery ended | ||
| 396 | * | ||
| 397 | * This structure records a LEB identified during recovery that needs to be | ||
| 398 | * cleaned but was not because UBIFS was mounted read-only. The information | ||
| 399 | * is used to clean the LEB when remounting to read-write mode. | ||
| 400 | */ | ||
| 401 | struct ubifs_unclean_leb { | ||
| 402 | struct list_head list; | ||
| 403 | int lnum; | ||
| 404 | int endpt; | ||
| 405 | }; | ||
| 406 | |||
| 407 | /* | ||
| 408 | * LEB properties flags. | ||
| 409 | * | ||
| 410 | * LPROPS_UNCAT: not categorized | ||
| 411 | * LPROPS_DIRTY: dirty > 0, not index | ||
| 412 | * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index | ||
| 413 | * LPROPS_FREE: free > 0, not empty, not index | ||
| 414 | * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs | ||
| 415 | * LPROPS_EMPTY: LEB is empty, not taken | ||
| 416 | * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken | ||
| 417 | * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken | ||
| 418 | * LPROPS_CAT_MASK: mask for the LEB categories above | ||
| 419 | * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) | ||
| 420 | * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) | ||
| 421 | */ | ||
| 422 | enum { | ||
| 423 | LPROPS_UNCAT = 0, | ||
| 424 | LPROPS_DIRTY = 1, | ||
| 425 | LPROPS_DIRTY_IDX = 2, | ||
| 426 | LPROPS_FREE = 3, | ||
| 427 | LPROPS_HEAP_CNT = 3, | ||
| 428 | LPROPS_EMPTY = 4, | ||
| 429 | LPROPS_FREEABLE = 5, | ||
| 430 | LPROPS_FRDI_IDX = 6, | ||
| 431 | LPROPS_CAT_MASK = 15, | ||
| 432 | LPROPS_TAKEN = 16, | ||
| 433 | LPROPS_INDEX = 32, | ||
| 434 | }; | ||
| 435 | |||
| 436 | /** | ||
| 437 | * struct ubifs_lprops - logical eraseblock properties. | ||
| 438 | * @free: amount of free space in bytes | ||
| 439 | * @dirty: amount of dirty space in bytes | ||
| 440 | * @flags: LEB properties flags (see above) | ||
| 441 | * @lnum: LEB number | ||
| 442 | * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) | ||
| 443 | * @hpos: heap position in heap of same-category lprops (other categories) | ||
| 444 | */ | ||
| 445 | struct ubifs_lprops { | ||
| 446 | int free; | ||
| 447 | int dirty; | ||
| 448 | int flags; | ||
| 449 | int lnum; | ||
| 450 | union { | ||
| 451 | struct list_head list; | ||
| 452 | int hpos; | ||
| 453 | }; | ||
| 454 | }; | ||
| 455 | |||
| 456 | /** | ||
| 457 | * struct ubifs_lpt_lprops - LPT logical eraseblock properties. | ||
| 458 | * @free: amount of free space in bytes | ||
| 459 | * @dirty: amount of dirty space in bytes | ||
| 460 | * @tgc: trivial GC flag (1 => unmap after commit end) | ||
| 461 | * @cmt: commit flag (1 => reserved for commit) | ||
| 462 | */ | ||
| 463 | struct ubifs_lpt_lprops { | ||
| 464 | int free; | ||
| 465 | int dirty; | ||
| 466 | unsigned tgc : 1; | ||
| 467 | unsigned cmt : 1; | ||
| 468 | }; | ||
| 469 | |||
| 470 | /** | ||
| 471 | * struct ubifs_lp_stats - statistics of eraseblocks in the main area. | ||
| 472 | * @empty_lebs: number of empty LEBs | ||
| 473 | * @taken_empty_lebs: number of taken LEBs | ||
| 474 | * @idx_lebs: number of indexing LEBs | ||
| 475 | * @total_free: total free space in bytes | ||
| 476 | * @total_dirty: total dirty space in bytes | ||
| 477 | * @total_used: total used space in bytes (includes only data LEBs) | ||
| 478 | * @total_dead: total dead space in bytes (includes only data LEBs) | ||
| 479 | * @total_dark: total dark space in bytes (includes only data LEBs) | ||
| 480 | * | ||
| 481 | * N.B. total_dirty and total_used are different to other total_* fields, | ||
| 482 | * because they account _all_ LEBs, not just data LEBs. | ||
| 483 | * | ||
| 484 | * 'taken_empty_lebs' counts the LEBs that are in the transient state of having | ||
| 485 | * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed | ||
| 486 | * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used | ||
| 487 | * by itself (in which case 'unused_lebs' would be a better name). In the case | ||
| 488 | * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC, | ||
| 489 | * but unlike other empty LEBs that are 'taken', it may not be written straight | ||
| 490 | * away (i.e. before the next commit start or unmount), so either gc_lnum must | ||
| 491 | * be specially accounted for, or the current approach followed i.e. count it | ||
| 492 | * under 'taken_empty_lebs'. | ||
| 493 | */ | ||
| 494 | struct ubifs_lp_stats { | ||
| 495 | int empty_lebs; | ||
| 496 | int taken_empty_lebs; | ||
| 497 | int idx_lebs; | ||
| 498 | long long total_free; | ||
| 499 | long long total_dirty; | ||
| 500 | long long total_used; | ||
| 501 | long long total_dead; | ||
| 502 | long long total_dark; | ||
| 503 | }; | ||
| 504 | |||
| 505 | struct ubifs_nnode; | ||
| 506 | |||
| 507 | /** | ||
| 508 | * struct ubifs_cnode - LEB Properties Tree common node. | ||
| 509 | * @parent: parent nnode | ||
| 510 | * @cnext: next cnode to commit | ||
| 511 | * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) | ||
| 512 | * @iip: index in parent | ||
| 513 | * @level: level in the tree (zero for pnodes, greater than zero for nnodes) | ||
| 514 | * @num: node number | ||
| 515 | */ | ||
| 516 | struct ubifs_cnode { | ||
| 517 | struct ubifs_nnode *parent; | ||
| 518 | struct ubifs_cnode *cnext; | ||
| 519 | unsigned long flags; | ||
| 520 | int iip; | ||
| 521 | int level; | ||
| 522 | int num; | ||
| 523 | }; | ||
| 524 | |||
| 525 | /** | ||
| 526 | * struct ubifs_pnode - LEB Properties Tree leaf node. | ||
| 527 | * @parent: parent nnode | ||
| 528 | * @cnext: next cnode to commit | ||
| 529 | * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) | ||
| 530 | * @iip: index in parent | ||
| 531 | * @level: level in the tree (always zero for pnodes) | ||
| 532 | * @num: node number | ||
| 533 | * @lprops: LEB properties array | ||
| 534 | */ | ||
| 535 | struct ubifs_pnode { | ||
| 536 | struct ubifs_nnode *parent; | ||
| 537 | struct ubifs_cnode *cnext; | ||
| 538 | unsigned long flags; | ||
| 539 | int iip; | ||
| 540 | int level; | ||
| 541 | int num; | ||
| 542 | struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; | ||
| 543 | }; | ||
| 544 | |||
| 545 | /** | ||
| 546 | * struct ubifs_nbranch - LEB Properties Tree internal node branch. | ||
| 547 | * @lnum: LEB number of child | ||
| 548 | * @offs: offset of child | ||
| 549 | * @nnode: nnode child | ||
| 550 | * @pnode: pnode child | ||
| 551 | * @cnode: cnode child | ||
| 552 | */ | ||
| 553 | struct ubifs_nbranch { | ||
| 554 | int lnum; | ||
| 555 | int offs; | ||
| 556 | union { | ||
| 557 | struct ubifs_nnode *nnode; | ||
| 558 | struct ubifs_pnode *pnode; | ||
| 559 | struct ubifs_cnode *cnode; | ||
| 560 | }; | ||
| 561 | }; | ||
| 562 | |||
| 563 | /** | ||
| 564 | * struct ubifs_nnode - LEB Properties Tree internal node. | ||
| 565 | * @parent: parent nnode | ||
| 566 | * @cnext: next cnode to commit | ||
| 567 | * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) | ||
| 568 | * @iip: index in parent | ||
| 569 | * @level: level in the tree (always greater than zero for nnodes) | ||
| 570 | * @num: node number | ||
| 571 | * @nbranch: branches to child nodes | ||
| 572 | */ | ||
| 573 | struct ubifs_nnode { | ||
| 574 | struct ubifs_nnode *parent; | ||
| 575 | struct ubifs_cnode *cnext; | ||
| 576 | unsigned long flags; | ||
| 577 | int iip; | ||
| 578 | int level; | ||
| 579 | int num; | ||
| 580 | struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; | ||
| 581 | }; | ||
| 582 | |||
| 583 | /** | ||
| 584 | * struct ubifs_lpt_heap - heap of categorized lprops. | ||
| 585 | * @arr: heap array | ||
| 586 | * @cnt: number in heap | ||
| 587 | * @max_cnt: maximum number allowed in heap | ||
| 588 | * | ||
| 589 | * There are %LPROPS_HEAP_CNT heaps. | ||
| 590 | */ | ||
| 591 | struct ubifs_lpt_heap { | ||
| 592 | struct ubifs_lprops **arr; | ||
| 593 | int cnt; | ||
| 594 | int max_cnt; | ||
| 595 | }; | ||
| 596 | |||
| 597 | /* | ||
| 598 | * Return codes for LPT scan callback function. | ||
| 599 | * | ||
| 600 | * LPT_SCAN_CONTINUE: continue scanning | ||
| 601 | * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory | ||
| 602 | * LPT_SCAN_STOP: stop scanning | ||
| 603 | */ | ||
| 604 | enum { | ||
| 605 | LPT_SCAN_CONTINUE = 0, | ||
| 606 | LPT_SCAN_ADD = 1, | ||
| 607 | LPT_SCAN_STOP = 2, | ||
| 608 | }; | ||
| 609 | |||
| 610 | struct ubifs_info; | ||
| 611 | |||
| 612 | /* Callback used by the 'ubifs_lpt_scan_nolock()' function */ | ||
| 613 | typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, | ||
| 614 | const struct ubifs_lprops *lprops, | ||
| 615 | int in_tree, void *data); | ||
| 616 | |||
| 617 | /** | ||
| 618 | * struct ubifs_wbuf - UBIFS write-buffer. | ||
| 619 | * @c: UBIFS file-system description object | ||
| 620 | * @buf: write-buffer (of min. flash I/O unit size) | ||
| 621 | * @lnum: logical eraseblock number the write-buffer points to | ||
| 622 | * @offs: write-buffer offset in this logical eraseblock | ||
| 623 | * @avail: number of bytes available in the write-buffer | ||
| 624 | * @used: number of used bytes in the write-buffer | ||
| 625 | * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, | ||
| 626 | * %UBI_UNKNOWN) | ||
| 627 | * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep | ||
| 628 | * up by 'mutex_lock_nested()). | ||
| 629 | * @sync_callback: write-buffer synchronization callback | ||
| 630 | * @io_mutex: serializes write-buffer I/O | ||
| 631 | * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes | ||
| 632 | * fields | ||
| 633 | * @timer: write-buffer timer | ||
| 634 | * @timeout: timer expire interval in jiffies | ||
| 635 | * @need_sync: it is set if its timer expired and needs sync | ||
| 636 | * @next_ino: points to the next position of the following inode number | ||
| 637 | * @inodes: stores the inode numbers of the nodes which are in wbuf | ||
| 638 | * | ||
| 639 | * The write-buffer synchronization callback is called when the write-buffer is | ||
| 640 | * synchronized in order to notify how much space was wasted due to | ||
| 641 | * write-buffer padding and how much free space is left in the LEB. | ||
| 642 | * | ||
| 643 | * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under | ||
| 644 | * spin-lock or mutex because they are written under both mutex and spin-lock. | ||
| 645 | * @buf is appended to under mutex but overwritten under both mutex and | ||
| 646 | * spin-lock. Thus the data between @buf and @buf + @used can be read under | ||
| 647 | * spinlock. | ||
| 648 | */ | ||
| 649 | struct ubifs_wbuf { | ||
| 650 | struct ubifs_info *c; | ||
| 651 | void *buf; | ||
| 652 | int lnum; | ||
| 653 | int offs; | ||
| 654 | int avail; | ||
| 655 | int used; | ||
| 656 | int dtype; | ||
| 657 | int jhead; | ||
| 658 | int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); | ||
| 659 | struct mutex io_mutex; | ||
| 660 | spinlock_t lock; | ||
| 661 | struct timer_list timer; | ||
| 662 | int timeout; | ||
| 663 | int need_sync; | ||
| 664 | int next_ino; | ||
| 665 | ino_t *inodes; | ||
| 666 | }; | ||
| 667 | |||
| 668 | /** | ||
| 669 | * struct ubifs_bud - bud logical eraseblock. | ||
| 670 | * @lnum: logical eraseblock number | ||
| 671 | * @start: where the (uncommitted) bud data starts | ||
| 672 | * @jhead: journal head number this bud belongs to | ||
| 673 | * @list: link in the list buds belonging to the same journal head | ||
| 674 | * @rb: link in the tree of all buds | ||
| 675 | */ | ||
| 676 | struct ubifs_bud { | ||
| 677 | int lnum; | ||
| 678 | int start; | ||
| 679 | int jhead; | ||
| 680 | struct list_head list; | ||
| 681 | struct rb_node rb; | ||
| 682 | }; | ||
| 683 | |||
| 684 | /** | ||
| 685 | * struct ubifs_jhead - journal head. | ||
| 686 | * @wbuf: head's write-buffer | ||
| 687 | * @buds_list: list of bud LEBs belonging to this journal head | ||
| 688 | * | ||
| 689 | * Note, the @buds list is protected by the @c->buds_lock. | ||
| 690 | */ | ||
| 691 | struct ubifs_jhead { | ||
| 692 | struct ubifs_wbuf wbuf; | ||
| 693 | struct list_head buds_list; | ||
| 694 | }; | ||
| 695 | |||
| 696 | /** | ||
| 697 | * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. | ||
| 698 | * @key: key | ||
| 699 | * @znode: znode address in memory | ||
| 700 | * @lnum: LEB number of the indexing node | ||
| 701 | * @offs: offset of the indexing node within @lnum | ||
| 702 | * @len: target node length | ||
| 703 | */ | ||
| 704 | struct ubifs_zbranch { | ||
| 705 | union ubifs_key key; | ||
| 706 | union { | ||
| 707 | struct ubifs_znode *znode; | ||
| 708 | void *leaf; | ||
| 709 | }; | ||
| 710 | int lnum; | ||
| 711 | int offs; | ||
| 712 | int len; | ||
| 713 | }; | ||
| 714 | |||
| 715 | /** | ||
| 716 | * struct ubifs_znode - in-memory representation of an indexing node. | ||
| 717 | * @parent: parent znode or NULL if it is the root | ||
| 718 | * @cnext: next znode to commit | ||
| 719 | * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) | ||
| 720 | * @time: last access time (seconds) | ||
| 721 | * @level: level of the entry in the TNC tree | ||
| 722 | * @child_cnt: count of child znodes | ||
| 723 | * @iip: index in parent's zbranch array | ||
| 724 | * @alt: lower bound of key range has altered i.e. child inserted at slot 0 | ||
| 725 | * @lnum: LEB number of the corresponding indexing node | ||
| 726 | * @offs: offset of the corresponding indexing node | ||
| 727 | * @len: length of the corresponding indexing node | ||
| 728 | * @zbranch: array of znode branches (@c->fanout elements) | ||
| 729 | */ | ||
| 730 | struct ubifs_znode { | ||
| 731 | struct ubifs_znode *parent; | ||
| 732 | struct ubifs_znode *cnext; | ||
| 733 | unsigned long flags; | ||
| 734 | unsigned long time; | ||
| 735 | int level; | ||
| 736 | int child_cnt; | ||
| 737 | int iip; | ||
| 738 | int alt; | ||
| 739 | #ifdef CONFIG_UBIFS_FS_DEBUG | ||
| 740 | int lnum, offs, len; | ||
| 741 | #endif | ||
| 742 | struct ubifs_zbranch zbranch[]; | ||
| 743 | }; | ||
| 744 | |||
| 745 | /** | ||
| 746 | * struct ubifs_node_range - node length range description data structure. | ||
| 747 | * @len: fixed node length | ||
| 748 | * @min_len: minimum possible node length | ||
| 749 | * @max_len: maximum possible node length | ||
| 750 | * | ||
| 751 | * If @max_len is %0, the node has fixed length @len. | ||
| 752 | */ | ||
| 753 | struct ubifs_node_range { | ||
| 754 | union { | ||
| 755 | int len; | ||
| 756 | int min_len; | ||
| 757 | }; | ||
| 758 | int max_len; | ||
| 759 | }; | ||
| 760 | |||
| 761 | /** | ||
| 762 | * struct ubifs_compressor - UBIFS compressor description structure. | ||
| 763 | * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) | ||
| 764 | * @cc: cryptoapi compressor handle | ||
| 765 | * @comp_mutex: mutex used during compression | ||
| 766 | * @decomp_mutex: mutex used during decompression | ||
| 767 | * @name: compressor name | ||
| 768 | * @capi_name: cryptoapi compressor name | ||
| 769 | */ | ||
| 770 | struct ubifs_compressor { | ||
| 771 | int compr_type; | ||
| 772 | struct crypto_comp *cc; | ||
| 773 | struct mutex *comp_mutex; | ||
| 774 | struct mutex *decomp_mutex; | ||
| 775 | const char *name; | ||
| 776 | const char *capi_name; | ||
| 777 | }; | ||
| 778 | |||
| 779 | /** | ||
| 780 | * struct ubifs_budget_req - budget requirements of an operation. | ||
| 781 | * | ||
| 782 | * @fast: non-zero if the budgeting should try to aquire budget quickly and | ||
| 783 | * should not try to call write-back | ||
| 784 | * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields | ||
| 785 | * have to be re-calculated | ||
| 786 | * @new_page: non-zero if the operation adds a new page | ||
| 787 | * @dirtied_page: non-zero if the operation makes a page dirty | ||
| 788 | * @new_dent: non-zero if the operation adds a new directory entry | ||
| 789 | * @mod_dent: non-zero if the operation removes or modifies an existing | ||
| 790 | * directory entry | ||
| 791 | * @new_ino: non-zero if the operation adds a new inode | ||
| 792 | * @new_ino_d: now much data newly created inode contains | ||
| 793 | * @dirtied_ino: how many inodes the operation makes dirty | ||
| 794 | * @dirtied_ino_d: now much data dirtied inode contains | ||
| 795 | * @idx_growth: how much the index will supposedly grow | ||
| 796 | * @data_growth: how much new data the operation will supposedly add | ||
| 797 | * @dd_growth: how much data that makes other data dirty the operation will | ||
| 798 | * supposedly add | ||
| 799 | * | ||
| 800 | * @idx_growth, @data_growth and @dd_growth are not used in budget request. The | ||
| 801 | * budgeting subsystem caches index and data growth values there to avoid | ||
| 802 | * re-calculating them when the budget is released. However, if @idx_growth is | ||
| 803 | * %-1, it is calculated by the release function using other fields. | ||
| 804 | * | ||
| 805 | * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d | ||
| 806 | * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made | ||
| 807 | * dirty by the re-name operation. | ||
| 808 | */ | ||
| 809 | struct ubifs_budget_req { | ||
| 810 | unsigned int fast:1; | ||
| 811 | unsigned int recalculate:1; | ||
| 812 | unsigned int new_page:1; | ||
| 813 | unsigned int dirtied_page:1; | ||
| 814 | unsigned int new_dent:1; | ||
| 815 | unsigned int mod_dent:1; | ||
| 816 | unsigned int new_ino:1; | ||
| 817 | unsigned int new_ino_d:13; | ||
| 818 | #ifndef UBIFS_DEBUG | ||
| 819 | unsigned int dirtied_ino:4; | ||
| 820 | unsigned int dirtied_ino_d:15; | ||
| 821 | #else | ||
| 822 | /* Not bit-fields to check for overflows */ | ||
| 823 | unsigned int dirtied_ino; | ||
| 824 | unsigned int dirtied_ino_d; | ||
| 825 | #endif | ||
| 826 | int idx_growth; | ||
| 827 | int data_growth; | ||
| 828 | int dd_growth; | ||
| 829 | }; | ||
| 830 | |||
| 831 | /** | ||
| 832 | * struct ubifs_orphan - stores the inode number of an orphan. | ||
| 833 | * @rb: rb-tree node of rb-tree of orphans sorted by inode number | ||
| 834 | * @list: list head of list of orphans in order added | ||
| 835 | * @new_list: list head of list of orphans added since the last commit | ||
| 836 | * @cnext: next orphan to commit | ||
| 837 | * @dnext: next orphan to delete | ||
| 838 | * @inum: inode number | ||
| 839 | * @new: %1 => added since the last commit, otherwise %0 | ||
| 840 | */ | ||
| 841 | struct ubifs_orphan { | ||
| 842 | struct rb_node rb; | ||
| 843 | struct list_head list; | ||
| 844 | struct list_head new_list; | ||
| 845 | struct ubifs_orphan *cnext; | ||
| 846 | struct ubifs_orphan *dnext; | ||
| 847 | ino_t inum; | ||
| 848 | int new; | ||
| 849 | }; | ||
| 850 | |||
| 851 | /** | ||
| 852 | * struct ubifs_mount_opts - UBIFS-specific mount options information. | ||
| 853 | * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) | ||
| 854 | */ | ||
| 855 | struct ubifs_mount_opts { | ||
| 856 | unsigned int unmount_mode:2; | ||
| 857 | }; | ||
| 858 | |||
| 859 | /** | ||
| 860 | * struct ubifs_info - UBIFS file-system description data structure | ||
| 861 | * (per-superblock). | ||
| 862 | * @vfs_sb: VFS @struct super_block object | ||
| 863 | * @bdi: backing device info object to make VFS happy and disable readahead | ||
| 864 | * | ||
| 865 | * @highest_inum: highest used inode number | ||
| 866 | * @vfs_gen: VFS inode generation counter | ||
| 867 | * @max_sqnum: current global sequence number | ||
| 868 | * @cmt_no: commit number (last successfully completed commit) | ||
| 869 | * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters | ||
| 870 | * @fmt_version: UBIFS on-flash format version | ||
| 871 | * @uuid: UUID from super block | ||
| 872 | * | ||
| 873 | * @lhead_lnum: log head logical eraseblock number | ||
| 874 | * @lhead_offs: log head offset | ||
| 875 | * @ltail_lnum: log tail logical eraseblock number (offset is always 0) | ||
| 876 | * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and | ||
| 877 | * @bud_bytes | ||
| 878 | * @min_log_bytes: minimum required number of bytes in the log | ||
 * @cmt_bud_bytes: used during commit to temporarily record the number of
 * bytes in committed buds
| 881 | * | ||
| 882 | * @buds: tree of all buds indexed by bud LEB number | ||
| 883 | * @bud_bytes: how many bytes of flash is used by buds | ||
| 884 | * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud | ||
| 885 | * lists | ||
| 886 | * @jhead_cnt: count of journal heads | ||
| 887 | * @jheads: journal heads (head zero is base head) | ||
| 888 | * @max_bud_bytes: maximum number of bytes allowed in buds | ||
| 889 | * @bg_bud_bytes: number of bud bytes when background commit is initiated | ||
| 890 | * @old_buds: buds to be released after commit ends | ||
| 891 | * @max_bud_cnt: maximum number of buds | ||
| 892 | * | ||
| 893 | * @commit_sem: synchronizes committer with other processes | ||
| 894 | * @cmt_state: commit state | ||
| 895 | * @cs_lock: commit state lock | ||
| 896 | * @cmt_wq: wait queue to sleep on if the log is full and a commit is running | ||
| 897 | * @fast_unmount: do not run journal commit before un-mounting | ||
| 898 | * @big_lpt: flag that LPT is too big to write whole during commit | ||
| 899 | * @check_lpt_free: flag that indicates LPT GC may be needed | ||
| 900 | * @nospace: non-zero if the file-system does not have flash space (used as | ||
| 901 | * optimization) | ||
| 902 | * @nospace_rp: the same as @nospace, but additionally means that even reserved | ||
| 903 | * pool is full | ||
| 904 | * | ||
| 905 | * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and | ||
| 906 | * @calc_idx_sz | ||
| 907 | * @zroot: zbranch which points to the root index node and znode | ||
| 908 | * @cnext: next znode to commit | ||
| 909 | * @enext: next znode to commit to empty space | ||
| 910 | * @gap_lebs: array of LEBs used by the in-gaps commit method | ||
| 911 | * @cbuf: commit buffer | ||
| 912 | * @ileb_buf: buffer for commit in-the-gaps method | ||
| 913 | * @ileb_len: length of data in ileb_buf | ||
| 914 | * @ihead_lnum: LEB number of index head | ||
| 915 | * @ihead_offs: offset of index head | ||
| 916 | * @ilebs: pre-allocated index LEBs | ||
| 917 | * @ileb_cnt: number of pre-allocated index LEBs | ||
| 918 | * @ileb_nxt: next pre-allocated index LEBs | ||
| 919 | * @old_idx: tree of index nodes obsoleted since the last commit start | ||
| 920 | * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c | ||
| 921 | * @new_ihead_lnum: used by debugging to check ihead_lnum | ||
| 922 | * @new_ihead_offs: used by debugging to check ihead_offs | ||
| 923 | * | ||
| 924 | * @mst_node: master node | ||
| 925 | * @mst_offs: offset of valid master node | ||
| 926 | * @mst_mutex: protects the master node area, @mst_node, and @mst_offs | ||
| 927 | * | ||
| 928 | * @log_lebs: number of logical eraseblocks in the log | ||
| 929 | * @log_bytes: log size in bytes | ||
| 930 | * @log_last: last LEB of the log | ||
| 931 | * @lpt_lebs: number of LEBs used for lprops table | ||
| 932 | * @lpt_first: first LEB of the lprops table area | ||
| 933 | * @lpt_last: last LEB of the lprops table area | ||
| 934 | * @orph_lebs: number of LEBs used for the orphan area | ||
| 935 | * @orph_first: first LEB of the orphan area | ||
| 936 | * @orph_last: last LEB of the orphan area | ||
| 937 | * @main_lebs: count of LEBs in the main area | ||
| 938 | * @main_first: first LEB of the main area | ||
| 939 | * @main_bytes: main area size in bytes | ||
| 940 | * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) | ||
| 941 | * | ||
| 942 | * @key_hash_type: type of the key hash | ||
| 943 | * @key_hash: direntry key hash function | ||
| 944 | * @key_fmt: key format | ||
| 945 | * @key_len: key length | ||
| 946 | * @fanout: fanout of the index tree (number of links per indexing node) | ||
| 947 | * | ||
| 948 | * @min_io_size: minimal input/output unit size | ||
| 949 | * @min_io_shift: number of bits in @min_io_size minus one | ||
| 950 | * @leb_size: logical eraseblock size in bytes | ||
| 951 | * @half_leb_size: half LEB size | ||
| 952 | * @leb_cnt: count of logical eraseblocks | ||
| 953 | * @max_leb_cnt: maximum count of logical eraseblocks | ||
| 954 | * @old_leb_cnt: count of logical eraseblocks before re-size | ||
| 955 | * @ro_media: the underlying UBI volume is read-only | ||
| 956 | * | ||
| 957 | * @dirty_pg_cnt: number of dirty pages (not used) | ||
| 958 | * @dirty_zn_cnt: number of dirty znodes | ||
| 959 | * @clean_zn_cnt: number of clean znodes | ||
| 960 | * | ||
| 961 | * @budg_idx_growth: amount of bytes budgeted for index growth | ||
| 962 | * @budg_data_growth: amount of bytes budgeted for cached data | ||
| 963 | * @budg_dd_growth: amount of bytes budgeted for cached data that will make | ||
| 964 | * other data dirty | ||
| 965 | * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, | ||
| 966 | * but which still have to be taken into account because | ||
| 967 | * the index has not been committed so far | ||
| 968 | * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, | ||
 * @budg_uncommitted_idx, @min_idx_lebs, @old_idx_sz, and @lst
| 970 | * @min_idx_lebs: minimum number of LEBs required for the index | ||
| 971 | * @old_idx_sz: size of index on flash | ||
| 972 | * @calc_idx_sz: temporary variable which is used to calculate new index size | ||
| 973 | * (contains accurate new index size at end of TNC commit start) | ||
| 974 | * @lst: lprops statistics | ||
| 975 | * | ||
| 976 | * @page_budget: budget for a page | ||
| 977 | * @inode_budget: budget for an inode | ||
| 978 | * @dent_budget: budget for a directory entry | ||
| 979 | * | ||
| 980 | * @ref_node_alsz: size of the LEB reference node aligned to the min. flash | ||
| 981 | * I/O unit | ||
| 982 | * @mst_node_alsz: master node aligned size | ||
| 983 | * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary | ||
| 984 | * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary | ||
| 985 | * @max_inode_sz: maximum possible inode size in bytes | ||
| 986 | * @max_znode_sz: size of znode in bytes | ||
| 987 | * @dead_wm: LEB dead space watermark | ||
| 988 | * @dark_wm: LEB dark space watermark | ||
| 989 | * @block_cnt: count of 4KiB blocks on the FS | ||
| 990 | * | ||
| 991 | * @ranges: UBIFS node length ranges | ||
| 992 | * @ubi: UBI volume descriptor | ||
| 993 | * @di: UBI device information | ||
| 994 | * @vi: UBI volume information | ||
| 995 | * | ||
| 996 | * @orph_tree: rb-tree of orphan inode numbers | ||
| 997 | * @orph_list: list of orphan inode numbers in order added | ||
| 998 | * @orph_new: list of orphan inode numbers added since last commit | ||
| 999 | * @orph_cnext: next orphan to commit | ||
| 1000 | * @orph_dnext: next orphan to delete | ||
| 1001 | * @orphan_lock: lock for orph_tree and orph_new | ||
| 1002 | * @orph_buf: buffer for orphan nodes | ||
| 1003 | * @new_orphans: number of orphans since last commit | ||
| 1004 | * @cmt_orphans: number of orphans being committed | ||
| 1005 | * @tot_orphans: number of orphans in the rb_tree | ||
| 1006 | * @max_orphans: maximum number of orphans allowed | ||
| 1007 | * @ohead_lnum: orphan head LEB number | ||
| 1008 | * @ohead_offs: orphan head offset | ||
| 1009 | * @no_orphs: non-zero if there are no orphans | ||
| 1010 | * | ||
| 1011 | * @bgt: UBIFS background thread | ||
| 1012 | * @bgt_name: background thread name | ||
| 1013 | * @need_bgt: if background thread should run | ||
| 1014 | * @need_wbuf_sync: if write-buffers have to be synchronized | ||
| 1015 | * | ||
| 1016 | * @gc_lnum: LEB number used for garbage collection | ||
| 1017 | * @sbuf: a buffer of LEB size used by GC and replay for scanning | ||
| 1018 | * @idx_gc: list of index LEBs that have been garbage collected | ||
| 1019 | * @idx_gc_cnt: number of elements on the idx_gc list | ||
| 1020 | * | ||
| 1021 | * @infos_list: links all 'ubifs_info' objects | ||
| 1022 | * @umount_mutex: serializes shrinker and un-mount | ||
| 1023 | * @shrinker_run_no: shrinker run number | ||
| 1024 | * | ||
| 1025 | * @space_bits: number of bits needed to record free or dirty space | ||
| 1026 | * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT | ||
| 1027 | * @lpt_offs_bits: number of bits needed to record an offset in the LPT | ||
 * @lpt_spc_bits: number of bits needed to record free or dirty space in the LPT
| 1029 | * @pcnt_bits: number of bits needed to record pnode or nnode number | ||
| 1030 | * @lnum_bits: number of bits needed to record LEB number | ||
| 1031 | * @nnode_sz: size of on-flash nnode | ||
| 1032 | * @pnode_sz: size of on-flash pnode | ||
| 1033 | * @ltab_sz: size of on-flash LPT lprops table | ||
| 1034 | * @lsave_sz: size of on-flash LPT save table | ||
| 1035 | * @pnode_cnt: number of pnodes | ||
| 1036 | * @nnode_cnt: number of nnodes | ||
| 1037 | * @lpt_hght: height of the LPT | ||
| 1038 | * @pnodes_have: number of pnodes in memory | ||
| 1039 | * | ||
| 1040 | * @lp_mutex: protects lprops table and all the other lprops-related fields | ||
| 1041 | * @lpt_lnum: LEB number of the root nnode of the LPT | ||
| 1042 | * @lpt_offs: offset of the root nnode of the LPT | ||
| 1043 | * @nhead_lnum: LEB number of LPT head | ||
| 1044 | * @nhead_offs: offset of LPT head | ||
| 1045 | * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab | ||
| 1046 | * @dirty_nn_cnt: number of dirty nnodes | ||
| 1047 | * @dirty_pn_cnt: number of dirty pnodes | ||
| 1048 | * @lpt_sz: LPT size | ||
| 1049 | * @lpt_nod_buf: buffer for an on-flash nnode or pnode | ||
| 1050 | * @lpt_buf: buffer of LEB size used by LPT | ||
| 1051 | * @nroot: address in memory of the root nnode of the LPT | ||
| 1052 | * @lpt_cnext: next LPT node to commit | ||
| 1053 | * @lpt_heap: array of heaps of categorized lprops | ||
| 1054 | * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at | ||
| 1055 | * previous commit start | ||
| 1056 | * @uncat_list: list of un-categorized LEBs | ||
| 1057 | * @empty_list: list of empty LEBs | ||
| 1058 | * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) | ||
| 1059 | * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) | ||
| 1060 | * @freeable_cnt: number of freeable LEBs in @freeable_list | ||
| 1061 | * | ||
| 1062 | * @ltab_lnum: LEB number of LPT's own lprops table | ||
| 1063 | * @ltab_offs: offset of LPT's own lprops table | ||
| 1064 | * @ltab: LPT's own lprops table | ||
| 1065 | * @ltab_cmt: LPT's own lprops table (commit copy) | ||
| 1066 | * @lsave_cnt: number of LEB numbers in LPT's save table | ||
| 1067 | * @lsave_lnum: LEB number of LPT's save table | ||
| 1068 | * @lsave_offs: offset of LPT's save table | ||
| 1069 | * @lsave: LPT's save table | ||
| 1070 | * @lscan_lnum: LEB number of last LPT scan | ||
| 1071 | * | ||
| 1072 | * @rp_size: size of the reserved pool in bytes | ||
| 1073 | * @report_rp_size: size of the reserved pool reported to user-space | ||
| 1074 | * @rp_uid: reserved pool user ID | ||
| 1075 | * @rp_gid: reserved pool group ID | ||
| 1076 | * | ||
| 1077 | * @empty: if the UBI device is empty | ||
| 1078 | * @replay_tree: temporary tree used during journal replay | ||
| 1079 | * @replay_list: temporary list used during journal replay | ||
| 1080 | * @replay_buds: list of buds to replay | ||
| 1081 | * @cs_sqnum: sequence number of first node in the log (commit start node) | ||
| 1082 | * @replay_sqnum: sequence number of node currently being replayed | ||
| 1083 | * @need_recovery: file-system needs recovery | ||
| 1084 | * @replaying: set to %1 during journal replay | ||
| 1085 | * @unclean_leb_list: LEBs to recover when mounting ro to rw | ||
| 1086 | * @rcvrd_mst_node: recovered master node to write when mounting ro to rw | ||
| 1087 | * @size_tree: inode size information for recovery | ||
| 1088 | * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) | ||
| 1089 | * @mount_opts: UBIFS-specific mount options | ||
| 1090 | * | ||
| 1091 | * @dbg_buf: a buffer of LEB size used for debugging purposes | ||
| 1092 | * @old_zroot: old index root - used by 'dbg_check_old_index()' | ||
| 1093 | * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' | ||
| 1094 | * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' | ||
| 1095 | * @failure_mode: failure mode for recovery testing | ||
| 1096 | * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls | ||
| 1097 | * @fail_timeout: time in jiffies when delay of failure mode expires | ||
| 1098 | * @fail_cnt: current number of calls to failure mode I/O functions | ||
| 1099 | * @fail_cnt_max: number of calls by which to delay failure mode | ||
| 1100 | */ | ||
struct ubifs_info {
	/* VFS superblock and backing device */
	struct super_block *vfs_sb;
	struct backing_dev_info bdi;

	/* General counters and identity (see kernel-doc above) */
	ino_t highest_inum;
	unsigned int vfs_gen;
	unsigned long long max_sqnum;
	unsigned long long cmt_no;
	spinlock_t cnt_lock;
	int fmt_version;
	unsigned char uuid[16];

	/* Journal log head/tail position */
	int lhead_lnum;
	int lhead_offs;
	int ltail_lnum;
	struct mutex log_mutex;
	int min_log_bytes;
	long long cmt_bud_bytes;

	/* Journal heads and buds */
	struct rb_root buds;
	long long bud_bytes;
	spinlock_t buds_lock;
	int jhead_cnt;
	struct ubifs_jhead *jheads;
	long long max_bud_bytes;
	long long bg_bud_bytes;
	struct list_head old_buds;
	int max_bud_cnt;

	/* Commit state and miscellaneous single-bit flags */
	struct rw_semaphore commit_sem;
	int cmt_state;
	spinlock_t cs_lock;
	wait_queue_head_t cmt_wq;
	unsigned int fast_unmount:1;
	unsigned int big_lpt:1;
	unsigned int check_lpt_free:1;
	unsigned int nospace:1;
	unsigned int nospace_rp:1;

	/* TNC (the in-memory index tree) */
	struct mutex tnc_mutex;
	struct ubifs_zbranch zroot;
	struct ubifs_znode *cnext;
	struct ubifs_znode *enext;
	int *gap_lebs;
	void *cbuf;
	void *ileb_buf;
	int ileb_len;
	int ihead_lnum;
	int ihead_offs;
	int *ilebs;
	int ileb_cnt;
	int ileb_nxt;
	struct rb_root old_idx;
	int *bottom_up_buf;
#ifdef CONFIG_UBIFS_FS_DEBUG
	int new_ihead_lnum;
	int new_ihead_offs;
#endif

	/* Master node */
	struct ubifs_mst_node *mst_node;
	int mst_offs;
	struct mutex mst_mutex;

	/* On-media layout: log, LPT, orphan and main area geometry */
	int log_lebs;
	long long log_bytes;
	int log_last;
	int lpt_lebs;
	int lpt_first;
	int lpt_last;
	int orph_lebs;
	int orph_first;
	int orph_last;
	int main_lebs;
	int main_first;
	long long main_bytes;
	int default_compr;

	/* Index key scheme */
	uint8_t key_hash_type;
	uint32_t (*key_hash)(const char *str, int len);
	int key_fmt;
	int key_len;
	int fanout;

	/* Underlying flash media characteristics */
	int min_io_size;
	int min_io_shift;
	int leb_size;
	int half_leb_size;
	int leb_cnt;
	int max_leb_cnt;
	int old_leb_cnt;
	int ro_media;

	/* Dirty/clean object counters */
	atomic_long_t dirty_pg_cnt;
	atomic_long_t dirty_zn_cnt;
	atomic_long_t clean_zn_cnt;

	/* Space budgeting state */
	long long budg_idx_growth;
	long long budg_data_growth;
	long long budg_dd_growth;
	long long budg_uncommitted_idx;
	spinlock_t space_lock;
	int min_idx_lebs;
	unsigned long long old_idx_sz;
	unsigned long long calc_idx_sz;
	struct ubifs_lp_stats lst;

	/* Per-object budget amounts */
	int page_budget;
	int inode_budget;
	int dent_budget;

	/* Node size bounds and space watermarks */
	int ref_node_alsz;
	int mst_node_alsz;
	int min_idx_node_sz;
	int max_idx_node_sz;
	long long max_inode_sz;
	int max_znode_sz;
	int dead_wm;
	int dark_wm;
	int block_cnt;

	/* Node ranges and the underlying UBI volume/device */
	struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT];
	struct ubi_volume_desc *ubi;
	struct ubi_device_info di;
	struct ubi_volume_info vi;

	/* Orphan inode tracking */
	struct rb_root orph_tree;
	struct list_head orph_list;
	struct list_head orph_new;
	struct ubifs_orphan *orph_cnext;
	struct ubifs_orphan *orph_dnext;
	spinlock_t orphan_lock;
	void *orph_buf;
	int new_orphans;
	int cmt_orphans;
	int tot_orphans;
	int max_orphans;
	int ohead_lnum;
	int ohead_offs;
	int no_orphs;

	/* Background thread */
	struct task_struct *bgt;
	char bgt_name[sizeof(BGT_NAME_PATTERN) + 9];
	int need_bgt;
	int need_wbuf_sync;

	/* Garbage collection */
	int gc_lnum;
	void *sbuf;
	struct list_head idx_gc;
	int idx_gc_cnt;

	/* Linkage into the global list of mounted UBIFS instances */
	struct list_head infos_list;
	struct mutex umount_mutex;
	unsigned int shrinker_run_no;

	/* LPT on-media encoding geometry (field/bit widths and node sizes) */
	int space_bits;
	int lpt_lnum_bits;
	int lpt_offs_bits;
	int lpt_spc_bits;
	int pcnt_bits;
	int lnum_bits;
	int nnode_sz;
	int pnode_sz;
	int ltab_sz;
	int lsave_sz;
	int pnode_cnt;
	int nnode_cnt;
	int lpt_hght;
	int pnodes_have;

	/* LPT (LEB properties tree) run-time state and category lists */
	struct mutex lp_mutex;
	int lpt_lnum;
	int lpt_offs;
	int nhead_lnum;
	int nhead_offs;
	int lpt_drty_flgs;
	int dirty_nn_cnt;
	int dirty_pn_cnt;
	long long lpt_sz;
	void *lpt_nod_buf;
	void *lpt_buf;
	struct ubifs_nnode *nroot;
	struct ubifs_cnode *lpt_cnext;
	struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT];
	struct ubifs_lpt_heap dirty_idx;
	struct list_head uncat_list;
	struct list_head empty_list;
	struct list_head freeable_list;
	struct list_head frdi_idx_list;
	int freeable_cnt;

	/* LPT's own lprops table and save table (see kernel-doc above) */
	int ltab_lnum;
	int ltab_offs;
	struct ubifs_lpt_lprops *ltab;
	struct ubifs_lpt_lprops *ltab_cmt;
	int lsave_cnt;
	int lsave_lnum;
	int lsave_offs;
	int *lsave;
	int lscan_lnum;

	/* Reserved pool (space reserved for a privileged user/group) */
	long long rp_size;
	long long report_rp_size;
	uid_t rp_uid;
	gid_t rp_gid;

	/* The below fields are used only during mounting and re-mounting */
	int empty;
	struct rb_root replay_tree;
	struct list_head replay_list;
	struct list_head replay_buds;
	unsigned long long cs_sqnum;
	unsigned long long replay_sqnum;
	int need_recovery;
	int replaying;
	struct list_head unclean_leb_list;
	struct ubifs_mst_node *rcvrd_mst_node;
	struct rb_root size_tree;
	int remounting_rw;
	struct ubifs_mount_opts mount_opts;

	/* Debugging-only state (old index checking, failure injection) */
#ifdef CONFIG_UBIFS_FS_DEBUG
	void *dbg_buf;
	struct ubifs_zbranch old_zroot;
	int old_zroot_level;
	unsigned long long old_zroot_sqnum;
	int failure_mode;
	int fail_delay;
	unsigned long fail_timeout;
	unsigned int fail_cnt;
	unsigned int fail_cnt_max;
#endif
};
| 1333 | |||
/* Globals and operation tables defined in other UBIFS source files */
extern struct list_head ubifs_infos;
extern spinlock_t ubifs_infos_lock;
extern atomic_long_t ubifs_clean_zn_cnt;
extern struct kmem_cache *ubifs_inode_slab;
extern struct super_operations ubifs_super_operations;
extern struct address_space_operations ubifs_file_address_operations;
extern struct file_operations ubifs_file_operations;
extern struct inode_operations ubifs_file_inode_operations;
extern struct file_operations ubifs_dir_operations;
extern struct inode_operations ubifs_dir_inode_operations;
extern struct inode_operations ubifs_symlink_inode_operations;
extern struct backing_dev_info ubifs_backing_dev_info;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
| 1347 | |||
| 1348 | /* io.c */ | ||
| 1349 | int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); | ||
| 1350 | int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, | ||
| 1351 | int dtype); | ||
| 1352 | int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); | ||
| 1353 | int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, | ||
| 1354 | int lnum, int offs); | ||
| 1355 | int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, | ||
| 1356 | int lnum, int offs); | ||
| 1357 | int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, | ||
| 1358 | int offs, int dtype); | ||
| 1359 | int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, | ||
| 1360 | int offs, int quiet); | ||
| 1361 | void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); | ||
| 1362 | void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); | ||
| 1363 | int ubifs_io_init(struct ubifs_info *c); | ||
| 1364 | void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); | ||
| 1365 | int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); | ||
| 1366 | int ubifs_bg_wbufs_sync(struct ubifs_info *c); | ||
| 1367 | void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); | ||
| 1368 | int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); | ||
| 1369 | |||
| 1370 | /* scan.c */ | ||
| 1371 | struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, | ||
| 1372 | int offs, void *sbuf); | ||
| 1373 | void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); | ||
| 1374 | int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, | ||
| 1375 | int offs, int quiet); | ||
| 1376 | struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, | ||
| 1377 | int offs, void *sbuf); | ||
| 1378 | void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, | ||
| 1379 | int lnum, int offs); | ||
| 1380 | int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, | ||
| 1381 | void *buf, int offs); | ||
| 1382 | void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, | ||
| 1383 | void *buf); | ||
| 1384 | |||
| 1385 | /* log.c */ | ||
| 1386 | void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); | ||
| 1387 | void ubifs_create_buds_lists(struct ubifs_info *c); | ||
| 1388 | int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); | ||
| 1389 | struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); | ||
| 1390 | struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); | ||
| 1391 | int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); | ||
| 1392 | int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); | ||
| 1393 | int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); | ||
| 1394 | int ubifs_consolidate_log(struct ubifs_info *c); | ||
| 1395 | |||
| 1396 | /* journal.c */ | ||
| 1397 | int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, | ||
| 1398 | const struct qstr *nm, const struct inode *inode, | ||
| 1399 | int deletion, int xent); | ||
| 1400 | int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, | ||
| 1401 | const union ubifs_key *key, const void *buf, int len); | ||
| 1402 | int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, | ||
| 1403 | int last_reference); | ||
| 1404 | int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, | ||
| 1405 | const struct dentry *old_dentry, | ||
| 1406 | const struct inode *new_dir, | ||
| 1407 | const struct dentry *new_dentry, int sync); | ||
| 1408 | int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, | ||
| 1409 | loff_t old_size, loff_t new_size); | ||
| 1410 | int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, | ||
| 1411 | const struct inode *inode, const struct qstr *nm); | ||
| 1412 | int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, | ||
| 1413 | const struct inode *inode2); | ||
| 1414 | |||
| 1415 | /* budget.c */ | ||
| 1416 | int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); | ||
| 1417 | void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); | ||
| 1418 | void ubifs_release_dirty_inode_budget(struct ubifs_info *c, | ||
| 1419 | struct ubifs_inode *ui); | ||
| 1420 | int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, | ||
| 1421 | struct ubifs_budget_req *req); | ||
| 1422 | void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, | ||
| 1423 | struct ubifs_budget_req *req); | ||
| 1424 | void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, | ||
| 1425 | struct ubifs_budget_req *req); | ||
| 1426 | long long ubifs_budg_get_free_space(struct ubifs_info *c); | ||
| 1427 | int ubifs_calc_min_idx_lebs(struct ubifs_info *c); | ||
| 1428 | void ubifs_convert_page_budget(struct ubifs_info *c); | ||
| 1429 | long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); | ||
| 1430 | |||
| 1431 | /* find.c */ | ||
| 1432 | int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, | ||
| 1433 | int squeeze); | ||
| 1434 | int ubifs_find_free_leb_for_idx(struct ubifs_info *c); | ||
| 1435 | int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | ||
| 1436 | int min_space, int pick_free); | ||
| 1437 | int ubifs_find_dirty_idx_leb(struct ubifs_info *c); | ||
| 1438 | int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); | ||
| 1439 | |||
| 1440 | /* tnc.c */ | ||
| 1441 | int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1442 | struct ubifs_znode **zn, int *n); | ||
| 1443 | int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1444 | void *node); | ||
| 1445 | int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1446 | void *node, const struct qstr *nm); | ||
| 1447 | int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1448 | void *node, int *lnum, int *offs); | ||
| 1449 | int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, | ||
| 1450 | int offs, int len); | ||
| 1451 | int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1452 | int old_lnum, int old_offs, int lnum, int offs, int len); | ||
| 1453 | int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1454 | int lnum, int offs, int len, const struct qstr *nm); | ||
| 1455 | int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); | ||
| 1456 | int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1457 | const struct qstr *nm); | ||
| 1458 | int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, | ||
| 1459 | union ubifs_key *to_key); | ||
| 1460 | int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); | ||
| 1461 | struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, | ||
| 1462 | union ubifs_key *key, | ||
| 1463 | const struct qstr *nm); | ||
| 1464 | void ubifs_tnc_close(struct ubifs_info *c); | ||
| 1465 | int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, | ||
| 1466 | int lnum, int offs, int is_idx); | ||
| 1467 | int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, | ||
| 1468 | int lnum, int offs); | ||
| 1469 | /* Shared by tnc.c for tnc_commit.c */ | ||
| 1470 | void destroy_old_idx(struct ubifs_info *c); | ||
| 1471 | int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, | ||
| 1472 | int lnum, int offs); | ||
| 1473 | int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); | ||
| 1474 | |||
| 1475 | /* tnc_misc.c */ | ||
| 1476 | struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, | ||
| 1477 | struct ubifs_znode *znode); | ||
| 1478 | int ubifs_search_zbranch(const struct ubifs_info *c, | ||
| 1479 | const struct ubifs_znode *znode, | ||
| 1480 | const union ubifs_key *key, int *n); | ||
| 1481 | struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); | ||
| 1482 | struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); | ||
| 1483 | long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); | ||
| 1484 | struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, | ||
| 1485 | struct ubifs_zbranch *zbr, | ||
| 1486 | struct ubifs_znode *parent, int iip); | ||
| 1487 | int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, | ||
| 1488 | void *node); | ||
| 1489 | |||
| 1490 | /* tnc_commit.c */ | ||
| 1491 | int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); | ||
| 1492 | int ubifs_tnc_end_commit(struct ubifs_info *c); | ||
| 1493 | |||
| 1494 | /* shrinker.c */ | ||
| 1495 | int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); | ||
| 1496 | |||
| 1497 | /* commit.c */ | ||
| 1498 | int ubifs_bg_thread(void *info); | ||
| 1499 | void ubifs_commit_required(struct ubifs_info *c); | ||
| 1500 | void ubifs_request_bg_commit(struct ubifs_info *c); | ||
| 1501 | int ubifs_run_commit(struct ubifs_info *c); | ||
| 1502 | void ubifs_recovery_commit(struct ubifs_info *c); | ||
| 1503 | int ubifs_gc_should_commit(struct ubifs_info *c); | ||
| 1504 | void ubifs_wait_for_commit(struct ubifs_info *c); | ||
| 1505 | |||
| 1506 | /* master.c */ | ||
| 1507 | int ubifs_read_master(struct ubifs_info *c); | ||
| 1508 | int ubifs_write_master(struct ubifs_info *c); | ||
| 1509 | |||
| 1510 | /* sb.c */ | ||
| 1511 | int ubifs_read_superblock(struct ubifs_info *c); | ||
| 1512 | struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); | ||
| 1513 | int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); | ||
| 1514 | |||
| 1515 | /* replay.c */ | ||
| 1516 | int ubifs_validate_entry(struct ubifs_info *c, | ||
| 1517 | const struct ubifs_dent_node *dent); | ||
| 1518 | int ubifs_replay_journal(struct ubifs_info *c); | ||
| 1519 | |||
| 1520 | /* gc.c */ | ||
| 1521 | int ubifs_garbage_collect(struct ubifs_info *c, int anyway); | ||
| 1522 | int ubifs_gc_start_commit(struct ubifs_info *c); | ||
| 1523 | int ubifs_gc_end_commit(struct ubifs_info *c); | ||
| 1524 | void ubifs_destroy_idx_gc(struct ubifs_info *c); | ||
| 1525 | int ubifs_get_idx_gc_leb(struct ubifs_info *c); | ||
| 1526 | int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); | ||
| 1527 | |||
| 1528 | /* orphan.c */ | ||
| 1529 | int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); | ||
| 1530 | void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); | ||
| 1531 | int ubifs_orphan_start_commit(struct ubifs_info *c); | ||
| 1532 | int ubifs_orphan_end_commit(struct ubifs_info *c); | ||
| 1533 | int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); | ||
| 1534 | |||
| 1535 | /* lpt.c */ | ||
| 1536 | int ubifs_calc_lpt_geom(struct ubifs_info *c); | ||
| 1537 | int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, | ||
| 1538 | int *lpt_lebs, int *big_lpt); | ||
| 1539 | int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); | ||
| 1540 | struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); | ||
| 1541 | struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); | ||
| 1542 | int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, | ||
| 1543 | ubifs_lpt_scan_callback scan_cb, void *data); | ||
| 1544 | |||
| 1545 | /* Shared by lpt.c for lpt_commit.c */ | ||
| 1546 | void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); | ||
| 1547 | void ubifs_pack_ltab(struct ubifs_info *c, void *buf, | ||
| 1548 | struct ubifs_lpt_lprops *ltab); | ||
| 1549 | void ubifs_pack_pnode(struct ubifs_info *c, void *buf, | ||
| 1550 | struct ubifs_pnode *pnode); | ||
| 1551 | void ubifs_pack_nnode(struct ubifs_info *c, void *buf, | ||
| 1552 | struct ubifs_nnode *nnode); | ||
| 1553 | struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, | ||
| 1554 | struct ubifs_nnode *parent, int iip); | ||
| 1555 | struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, | ||
| 1556 | struct ubifs_nnode *parent, int iip); | ||
| 1557 | int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); | ||
| 1558 | void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); | ||
| 1559 | void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); | ||
| 1560 | uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); | ||
| 1561 | struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); | ||
| 1562 | |||
| 1563 | /* lpt_commit.c */ | ||
| 1564 | int ubifs_lpt_start_commit(struct ubifs_info *c); | ||
| 1565 | int ubifs_lpt_end_commit(struct ubifs_info *c); | ||
| 1566 | int ubifs_lpt_post_commit(struct ubifs_info *c); | ||
| 1567 | void ubifs_lpt_free(struct ubifs_info *c, int wr_only); | ||
| 1568 | |||
| 1569 | /* lprops.c */ | ||
| 1570 | void ubifs_get_lprops(struct ubifs_info *c); | ||
| 1571 | const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, | ||
| 1572 | const struct ubifs_lprops *lp, | ||
| 1573 | int free, int dirty, int flags, | ||
| 1574 | int idx_gc_cnt); | ||
| 1575 | void ubifs_release_lprops(struct ubifs_info *c); | ||
| 1576 | void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); | ||
| 1577 | void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, | ||
| 1578 | int cat); | ||
| 1579 | void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, | ||
| 1580 | struct ubifs_lprops *new_lprops); | ||
| 1581 | void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); | ||
| 1582 | int ubifs_categorize_lprops(const struct ubifs_info *c, | ||
| 1583 | const struct ubifs_lprops *lprops); | ||
| 1584 | int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, | ||
| 1585 | int flags_set, int flags_clean, int idx_gc_cnt); | ||
| 1586 | int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, | ||
| 1587 | int flags_set, int flags_clean); | ||
| 1588 | int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); | ||
| 1589 | const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); | ||
| 1590 | const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); | ||
| 1591 | const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); | ||
| 1592 | const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); | ||
| 1593 | |||
| 1594 | /* file.c */ | ||
| 1595 | int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); | ||
| 1596 | int ubifs_setattr(struct dentry *dentry, struct iattr *attr); | ||
| 1597 | |||
| 1598 | /* dir.c */ | ||
| 1599 | struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, | ||
| 1600 | int mode); | ||
| 1601 | int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
| 1602 | struct kstat *stat); | ||
| 1603 | |||
| 1604 | /* xattr.c */ | ||
| 1605 | int ubifs_setxattr(struct dentry *dentry, const char *name, | ||
| 1606 | const void *value, size_t size, int flags); | ||
| 1607 | ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, | ||
| 1608 | size_t size); | ||
| 1609 | ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); | ||
| 1610 | int ubifs_removexattr(struct dentry *dentry, const char *name); | ||
| 1611 | |||
| 1612 | /* super.c */ | ||
| 1613 | struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); | ||
| 1614 | |||
| 1615 | /* recovery.c */ | ||
| 1616 | int ubifs_recover_master_node(struct ubifs_info *c); | ||
| 1617 | int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); | ||
| 1618 | struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, | ||
| 1619 | int offs, void *sbuf, int grouped); | ||
| 1620 | struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, | ||
| 1621 | int offs, void *sbuf); | ||
| 1622 | int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); | ||
| 1623 | int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); | ||
| 1624 | int ubifs_rcvry_gc_commit(struct ubifs_info *c); | ||
| 1625 | int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, | ||
| 1626 | int deletion, loff_t new_size); | ||
| 1627 | int ubifs_recover_size(struct ubifs_info *c); | ||
| 1628 | void ubifs_destroy_size_tree(struct ubifs_info *c); | ||
| 1629 | |||
| 1630 | /* ioctl.c */ | ||
| 1631 | long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); | ||
| 1632 | void ubifs_set_inode_flags(struct inode *inode); | ||
| 1633 | #ifdef CONFIG_COMPAT | ||
| 1634 | long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); | ||
| 1635 | #endif | ||
| 1636 | |||
| 1637 | /* compressor.c */ | ||
| 1638 | int __init ubifs_compressors_init(void); | ||
| 1639 | void __exit ubifs_compressors_exit(void); | ||
| 1640 | void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, | ||
| 1641 | int *compr_type); | ||
| 1642 | int ubifs_decompress(const void *buf, int len, void *out, int *out_len, | ||
| 1643 | int compr_type); | ||
| 1644 | |||
| 1645 | #include "debug.h" | ||
| 1646 | #include "misc.h" | ||
| 1647 | #include "key.h" | ||
| 1648 | |||
| 1649 | #endif /* !__UBIFS_H__ */ | ||
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c new file mode 100644 index 000000000000..1388a078e1a9 --- /dev/null +++ b/fs/ubifs/xattr.c | |||
| @@ -0,0 +1,581 @@ | |||
| 1 | /* | ||
| 2 | * This file is part of UBIFS. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published by | ||
| 8 | * the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License along with | ||
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 | ||
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 18 | * | ||
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) | ||
| 20 | * Adrian Hunter | ||
| 21 | */ | ||
| 22 | |||
| 23 | /* | ||
| 24 | * This file implements UBIFS extended attributes support. | ||
| 25 | * | ||
| 26 | * Extended attributes are implemented as regular inodes with attached data, | ||
| 27 | * which limits extended attribute size to UBIFS block size (4KiB). Names of | ||
| 28 | * extended attributes are described by extended attribute entries (xentries), | ||
| 29 | * which are almost identical to directory entries, but have different key type. | ||
| 30 | * | ||
| 31 | * In other words, the situation with extended attributes is very similar to | ||
| 32 | * directories. Indeed, any inode (but of course not xattr inodes) may have a | ||
| 33 | * number of associated xentries, just like directory inodes have associated | ||
| 34 | * directory entries. Extended attribute entries store the name of the extended | ||
| 35 | * attribute, the host inode number, and the extended attribute inode number. | ||
| 36 | * Similarly, direntries store the name, the parent and the target inode | ||
| 37 | * numbers. Thus, most of the common UBIFS mechanisms may be re-used for | ||
| 38 | * extended attributes. | ||
| 39 | * | ||
 * The number of extended attributes is not limited, but there is a Linux
| 41 | * limitation on the maximum possible size of the list of all extended | ||
| 42 | * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure | ||
| 43 | * the sum of all extended attribute names of the inode does not exceed that | ||
| 44 | * limit. | ||
| 45 | * | ||
| 46 | * Extended attributes are synchronous, which means they are written to the | ||
| 47 | * flash media synchronously and there is no write-back for extended attribute | ||
| 48 | * inodes. The extended attribute values are not stored in compressed form on | ||
| 49 | * the media. | ||
| 50 | * | ||
| 51 | * Since extended attributes are represented by regular inodes, they are cached | ||
| 52 | * in the VFS inode cache. The xentries are cached in the LNC cache (see | ||
| 53 | * tnc.c). | ||
| 54 | * | ||
| 55 | * ACL support is not implemented. | ||
| 56 | */ | ||
| 57 | |||
| 58 | #include <linux/xattr.h> | ||
| 59 | #include <linux/posix_acl_xattr.h> | ||
| 60 | #include "ubifs.h" | ||
| 61 | |||
| 62 | /* | ||
| 63 | * Limit the number of extended attributes per inode so that the total size | ||
| 64 | * (xattr_size) is guaranteeded to fit in an 'unsigned int'. | ||
| 65 | */ | ||
| 66 | #define MAX_XATTRS_PER_INODE 65535 | ||
| 67 | |||
| 68 | /* | ||
| 69 | * Extended attribute type constants. | ||
| 70 | * | ||
| 71 | * USER_XATTR: user extended attribute ("user.*") | ||
| 72 | * TRUSTED_XATTR: trusted extended attribute ("trusted.*) | ||
| 73 | * SECURITY_XATTR: security extended attribute ("security.*") | ||
| 74 | */ | ||
| 75 | enum { | ||
| 76 | USER_XATTR, | ||
| 77 | TRUSTED_XATTR, | ||
| 78 | SECURITY_XATTR, | ||
| 79 | }; | ||
| 80 | |||
/*
 * Empty (all-NULL) operation tables. Xattr inodes are not meant to be
 * accessed through the normal VFS paths, so create_xattr() points their
 * inode, address-space and file operations at these dummies.
 */
static struct inode_operations none_inode_operations;
static struct address_space_operations none_address_operations;
static struct file_operations none_file_operations;
| 84 | |||
| 85 | /** | ||
| 86 | * create_xattr - create an extended attribute. | ||
| 87 | * @c: UBIFS file-system description object | ||
| 88 | * @host: host inode | ||
| 89 | * @nm: extended attribute name | ||
| 90 | * @value: extended attribute value | ||
| 91 | * @size: size of extended attribute value | ||
| 92 | * | ||
| 93 | * This is a helper function which creates an extended attribute of name @nm | ||
| 94 | * and value @value for inode @host. The host inode is also updated on flash | ||
| 95 | * because the ctime and extended attribute accounting data changes. This | ||
| 96 | * function returns zero in case of success and a negative error code in case | ||
| 97 | * of failure. | ||
| 98 | */ | ||
| 99 | static int create_xattr(struct ubifs_info *c, struct inode *host, | ||
| 100 | const struct qstr *nm, const void *value, int size) | ||
| 101 | { | ||
| 102 | int err; | ||
| 103 | struct inode *inode; | ||
| 104 | struct ubifs_inode *ui, *host_ui = ubifs_inode(host); | ||
| 105 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, | ||
| 106 | .new_ino_d = size, .dirtied_ino = 1, | ||
| 107 | .dirtied_ino_d = host_ui->data_len}; | ||
| 108 | |||
| 109 | if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) | ||
| 110 | return -ENOSPC; | ||
| 111 | /* | ||
| 112 | * Linux limits the maximum size of the extended attribute names list | ||
| 113 | * to %XATTR_LIST_MAX. This means we should not allow creating more | ||
| 114 | * extended attributes if the name list becomes larger. This limitation | ||
| 115 | * is artificial for UBIFS, though. | ||
| 116 | */ | ||
| 117 | if (host_ui->xattr_names + host_ui->xattr_cnt + | ||
| 118 | nm->len + 1 > XATTR_LIST_MAX) | ||
| 119 | return -ENOSPC; | ||
| 120 | |||
| 121 | err = ubifs_budget_space(c, &req); | ||
| 122 | if (err) | ||
| 123 | return err; | ||
| 124 | |||
| 125 | inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); | ||
| 126 | if (IS_ERR(inode)) { | ||
| 127 | err = PTR_ERR(inode); | ||
| 128 | goto out_budg; | ||
| 129 | } | ||
| 130 | |||
| 131 | mutex_lock(&host_ui->ui_mutex); | ||
| 132 | /* Re-define all operations to be "nothing" */ | ||
| 133 | inode->i_mapping->a_ops = &none_address_operations; | ||
| 134 | inode->i_op = &none_inode_operations; | ||
| 135 | inode->i_fop = &none_file_operations; | ||
| 136 | |||
| 137 | inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; | ||
| 138 | ui = ubifs_inode(inode); | ||
| 139 | ui->xattr = 1; | ||
| 140 | ui->flags |= UBIFS_XATTR_FL; | ||
| 141 | ui->data = kmalloc(size, GFP_NOFS); | ||
| 142 | if (!ui->data) { | ||
| 143 | err = -ENOMEM; | ||
| 144 | goto out_unlock; | ||
| 145 | } | ||
| 146 | |||
| 147 | memcpy(ui->data, value, size); | ||
| 148 | host->i_ctime = ubifs_current_time(host); | ||
| 149 | host_ui->xattr_cnt += 1; | ||
| 150 | host_ui->xattr_size += CALC_DENT_SIZE(nm->len); | ||
| 151 | host_ui->xattr_size += CALC_XATTR_BYTES(size); | ||
| 152 | host_ui->xattr_names += nm->len; | ||
| 153 | |||
| 154 | /* | ||
| 155 | * We do not use i_size_write() because nobody can race with us as we | ||
| 156 | * are holding host @host->i_mutex - every xattr operation for this | ||
| 157 | * inode is serialized by it. | ||
| 158 | */ | ||
| 159 | inode->i_size = ui->ui_size = size; | ||
| 160 | ui->data_len = size; | ||
| 161 | err = ubifs_jnl_update(c, host, nm, inode, 0, 1); | ||
| 162 | if (err) | ||
| 163 | goto out_cancel; | ||
| 164 | mutex_unlock(&host_ui->ui_mutex); | ||
| 165 | |||
| 166 | ubifs_release_budget(c, &req); | ||
| 167 | insert_inode_hash(inode); | ||
| 168 | iput(inode); | ||
| 169 | return 0; | ||
| 170 | |||
| 171 | out_cancel: | ||
| 172 | host_ui->xattr_cnt -= 1; | ||
| 173 | host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); | ||
| 174 | host_ui->xattr_size -= CALC_XATTR_BYTES(size); | ||
| 175 | out_unlock: | ||
| 176 | mutex_unlock(&host_ui->ui_mutex); | ||
| 177 | make_bad_inode(inode); | ||
| 178 | iput(inode); | ||
| 179 | out_budg: | ||
| 180 | ubifs_release_budget(c, &req); | ||
| 181 | return err; | ||
| 182 | } | ||
| 183 | |||
| 184 | /** | ||
| 185 | * change_xattr - change an extended attribute. | ||
| 186 | * @c: UBIFS file-system description object | ||
| 187 | * @host: host inode | ||
| 188 | * @inode: extended attribute inode | ||
| 189 | * @value: extended attribute value | ||
| 190 | * @size: size of extended attribute value | ||
| 191 | * | ||
| 192 | * This helper function changes the value of extended attribute @inode with new | ||
| 193 | * data from @value. Returns zero in case of success and a negative error code | ||
| 194 | * in case of failure. | ||
| 195 | */ | ||
| 196 | static int change_xattr(struct ubifs_info *c, struct inode *host, | ||
| 197 | struct inode *inode, const void *value, int size) | ||
| 198 | { | ||
| 199 | int err; | ||
| 200 | struct ubifs_inode *host_ui = ubifs_inode(host); | ||
| 201 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 202 | struct ubifs_budget_req req = { .dirtied_ino = 2, | ||
| 203 | .dirtied_ino_d = size + host_ui->data_len }; | ||
| 204 | |||
| 205 | ubifs_assert(ui->data_len == inode->i_size); | ||
| 206 | err = ubifs_budget_space(c, &req); | ||
| 207 | if (err) | ||
| 208 | return err; | ||
| 209 | |||
| 210 | mutex_lock(&host_ui->ui_mutex); | ||
| 211 | host->i_ctime = ubifs_current_time(host); | ||
| 212 | host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); | ||
| 213 | host_ui->xattr_size += CALC_XATTR_BYTES(size); | ||
| 214 | |||
| 215 | kfree(ui->data); | ||
| 216 | ui->data = kmalloc(size, GFP_NOFS); | ||
| 217 | if (!ui->data) { | ||
| 218 | err = -ENOMEM; | ||
| 219 | goto out_unlock; | ||
| 220 | } | ||
| 221 | |||
| 222 | memcpy(ui->data, value, size); | ||
| 223 | inode->i_size = ui->ui_size = size; | ||
| 224 | ui->data_len = size; | ||
| 225 | |||
| 226 | /* | ||
| 227 | * It is important to write the host inode after the xattr inode | ||
| 228 | * because if the host inode gets synchronized (via 'fsync()'), then | ||
| 229 | * the extended attribute inode gets synchronized, because it goes | ||
| 230 | * before the host inode in the write-buffer. | ||
| 231 | */ | ||
| 232 | err = ubifs_jnl_change_xattr(c, inode, host); | ||
| 233 | if (err) | ||
| 234 | goto out_cancel; | ||
| 235 | mutex_unlock(&host_ui->ui_mutex); | ||
| 236 | |||
| 237 | ubifs_release_budget(c, &req); | ||
| 238 | return 0; | ||
| 239 | |||
| 240 | out_cancel: | ||
| 241 | host_ui->xattr_size -= CALC_XATTR_BYTES(size); | ||
| 242 | host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); | ||
| 243 | make_bad_inode(inode); | ||
| 244 | out_unlock: | ||
| 245 | mutex_unlock(&host_ui->ui_mutex); | ||
| 246 | ubifs_release_budget(c, &req); | ||
| 247 | return err; | ||
| 248 | } | ||
| 249 | |||
| 250 | /** | ||
| 251 | * check_namespace - check extended attribute name-space. | ||
| 252 | * @nm: extended attribute name | ||
| 253 | * | ||
| 254 | * This function makes sure the extended attribute name belongs to one of the | ||
| 255 | * supported extended attribute name-spaces. Returns name-space index in case | ||
| 256 | * of success and a negative error code in case of failure. | ||
| 257 | */ | ||
| 258 | static int check_namespace(const struct qstr *nm) | ||
| 259 | { | ||
| 260 | int type; | ||
| 261 | |||
| 262 | if (nm->len > UBIFS_MAX_NLEN) | ||
| 263 | return -ENAMETOOLONG; | ||
| 264 | |||
| 265 | if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, | ||
| 266 | XATTR_TRUSTED_PREFIX_LEN)) { | ||
| 267 | if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') | ||
| 268 | return -EINVAL; | ||
| 269 | type = TRUSTED_XATTR; | ||
| 270 | } else if (!strncmp(nm->name, XATTR_USER_PREFIX, | ||
| 271 | XATTR_USER_PREFIX_LEN)) { | ||
| 272 | if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') | ||
| 273 | return -EINVAL; | ||
| 274 | type = USER_XATTR; | ||
| 275 | } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, | ||
| 276 | XATTR_SECURITY_PREFIX_LEN)) { | ||
| 277 | if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') | ||
| 278 | return -EINVAL; | ||
| 279 | type = SECURITY_XATTR; | ||
| 280 | } else | ||
| 281 | return -EOPNOTSUPP; | ||
| 282 | |||
| 283 | return type; | ||
| 284 | } | ||
| 285 | |||
| 286 | static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) | ||
| 287 | { | ||
| 288 | struct inode *inode; | ||
| 289 | |||
| 290 | inode = ubifs_iget(c->vfs_sb, inum); | ||
| 291 | if (IS_ERR(inode)) { | ||
| 292 | ubifs_err("dead extended attribute entry, error %d", | ||
| 293 | (int)PTR_ERR(inode)); | ||
| 294 | return inode; | ||
| 295 | } | ||
| 296 | if (ubifs_inode(inode)->xattr) | ||
| 297 | return inode; | ||
| 298 | ubifs_err("corrupt extended attribute entry"); | ||
| 299 | iput(inode); | ||
| 300 | return ERR_PTR(-EINVAL); | ||
| 301 | } | ||
| 302 | |||
| 303 | int ubifs_setxattr(struct dentry *dentry, const char *name, | ||
| 304 | const void *value, size_t size, int flags) | ||
| 305 | { | ||
| 306 | struct inode *inode, *host = dentry->d_inode; | ||
| 307 | struct ubifs_info *c = host->i_sb->s_fs_info; | ||
| 308 | struct qstr nm = { .name = name, .len = strlen(name) }; | ||
| 309 | struct ubifs_dent_node *xent; | ||
| 310 | union ubifs_key key; | ||
| 311 | int err, type; | ||
| 312 | |||
| 313 | dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, | ||
| 314 | host->i_ino, dentry->d_name.len, dentry->d_name.name, size); | ||
| 315 | |||
| 316 | if (size > UBIFS_MAX_INO_DATA) | ||
| 317 | return -ERANGE; | ||
| 318 | |||
| 319 | type = check_namespace(&nm); | ||
| 320 | if (type < 0) | ||
| 321 | return type; | ||
| 322 | |||
| 323 | xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); | ||
| 324 | if (!xent) | ||
| 325 | return -ENOMEM; | ||
| 326 | |||
| 327 | /* | ||
| 328 | * The extended attribute entries are stored in LNC, so multiple | ||
| 329 | * look-ups do not involve reading the flash. | ||
| 330 | */ | ||
| 331 | xent_key_init(c, &key, host->i_ino, &nm); | ||
| 332 | err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); | ||
| 333 | if (err) { | ||
| 334 | if (err != -ENOENT) | ||
| 335 | goto out_free; | ||
| 336 | |||
| 337 | if (flags & XATTR_REPLACE) | ||
| 338 | /* We are asked not to create the xattr */ | ||
| 339 | err = -ENODATA; | ||
| 340 | else | ||
| 341 | err = create_xattr(c, host, &nm, value, size); | ||
| 342 | goto out_free; | ||
| 343 | } | ||
| 344 | |||
| 345 | if (flags & XATTR_CREATE) { | ||
| 346 | /* We are asked not to replace the xattr */ | ||
| 347 | err = -EEXIST; | ||
| 348 | goto out_free; | ||
| 349 | } | ||
| 350 | |||
| 351 | inode = iget_xattr(c, le64_to_cpu(xent->inum)); | ||
| 352 | if (IS_ERR(inode)) { | ||
| 353 | err = PTR_ERR(inode); | ||
| 354 | goto out_free; | ||
| 355 | } | ||
| 356 | |||
| 357 | err = change_xattr(c, host, inode, value, size); | ||
| 358 | iput(inode); | ||
| 359 | |||
| 360 | out_free: | ||
| 361 | kfree(xent); | ||
| 362 | return err; | ||
| 363 | } | ||
| 364 | |||
| 365 | ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, | ||
| 366 | size_t size) | ||
| 367 | { | ||
| 368 | struct inode *inode, *host = dentry->d_inode; | ||
| 369 | struct ubifs_info *c = host->i_sb->s_fs_info; | ||
| 370 | struct qstr nm = { .name = name, .len = strlen(name) }; | ||
| 371 | struct ubifs_inode *ui; | ||
| 372 | struct ubifs_dent_node *xent; | ||
| 373 | union ubifs_key key; | ||
| 374 | int err; | ||
| 375 | |||
| 376 | dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name, | ||
| 377 | host->i_ino, dentry->d_name.len, dentry->d_name.name, size); | ||
| 378 | |||
| 379 | err = check_namespace(&nm); | ||
| 380 | if (err < 0) | ||
| 381 | return err; | ||
| 382 | |||
| 383 | xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); | ||
| 384 | if (!xent) | ||
| 385 | return -ENOMEM; | ||
| 386 | |||
| 387 | mutex_lock(&host->i_mutex); | ||
| 388 | xent_key_init(c, &key, host->i_ino, &nm); | ||
| 389 | err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); | ||
| 390 | if (err) { | ||
| 391 | if (err == -ENOENT) | ||
| 392 | err = -ENODATA; | ||
| 393 | goto out_unlock; | ||
| 394 | } | ||
| 395 | |||
| 396 | inode = iget_xattr(c, le64_to_cpu(xent->inum)); | ||
| 397 | if (IS_ERR(inode)) { | ||
| 398 | err = PTR_ERR(inode); | ||
| 399 | goto out_unlock; | ||
| 400 | } | ||
| 401 | |||
| 402 | ui = ubifs_inode(inode); | ||
| 403 | ubifs_assert(inode->i_size == ui->data_len); | ||
| 404 | ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); | ||
| 405 | |||
| 406 | if (buf) { | ||
| 407 | /* If @buf is %NULL we are supposed to return the length */ | ||
| 408 | if (ui->data_len > size) { | ||
| 409 | dbg_err("buffer size %zd, xattr len %d", | ||
| 410 | size, ui->data_len); | ||
| 411 | err = -ERANGE; | ||
| 412 | goto out_iput; | ||
| 413 | } | ||
| 414 | |||
| 415 | memcpy(buf, ui->data, ui->data_len); | ||
| 416 | } | ||
| 417 | err = ui->data_len; | ||
| 418 | |||
| 419 | out_iput: | ||
| 420 | iput(inode); | ||
| 421 | out_unlock: | ||
| 422 | mutex_unlock(&host->i_mutex); | ||
| 423 | kfree(xent); | ||
| 424 | return err; | ||
| 425 | } | ||
| 426 | |||
| 427 | ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) | ||
| 428 | { | ||
| 429 | union ubifs_key key; | ||
| 430 | struct inode *host = dentry->d_inode; | ||
| 431 | struct ubifs_info *c = host->i_sb->s_fs_info; | ||
| 432 | struct ubifs_inode *host_ui = ubifs_inode(host); | ||
| 433 | struct ubifs_dent_node *xent, *pxent = NULL; | ||
| 434 | int err, len, written = 0; | ||
| 435 | struct qstr nm = { .name = NULL }; | ||
| 436 | |||
| 437 | dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino, | ||
| 438 | dentry->d_name.len, dentry->d_name.name, size); | ||
| 439 | |||
| 440 | len = host_ui->xattr_names + host_ui->xattr_cnt; | ||
| 441 | if (!buffer) | ||
| 442 | /* | ||
| 443 | * We should return the minimum buffer size which will fit a | ||
| 444 | * null-terminated list of all the extended attribute names. | ||
| 445 | */ | ||
| 446 | return len; | ||
| 447 | |||
| 448 | if (len > size) | ||
| 449 | return -ERANGE; | ||
| 450 | |||
| 451 | lowest_xent_key(c, &key, host->i_ino); | ||
| 452 | |||
| 453 | mutex_lock(&host->i_mutex); | ||
| 454 | while (1) { | ||
| 455 | int type; | ||
| 456 | |||
| 457 | xent = ubifs_tnc_next_ent(c, &key, &nm); | ||
| 458 | if (unlikely(IS_ERR(xent))) { | ||
| 459 | err = PTR_ERR(xent); | ||
| 460 | break; | ||
| 461 | } | ||
| 462 | |||
| 463 | nm.name = xent->name; | ||
| 464 | nm.len = le16_to_cpu(xent->nlen); | ||
| 465 | |||
| 466 | type = check_namespace(&nm); | ||
| 467 | if (unlikely(type < 0)) { | ||
| 468 | err = type; | ||
| 469 | break; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* Show trusted namespace only for "power" users */ | ||
| 473 | if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { | ||
| 474 | memcpy(buffer + written, nm.name, nm.len + 1); | ||
| 475 | written += nm.len + 1; | ||
| 476 | } | ||
| 477 | |||
| 478 | kfree(pxent); | ||
| 479 | pxent = xent; | ||
| 480 | key_read(c, &xent->key, &key); | ||
| 481 | } | ||
| 482 | mutex_unlock(&host->i_mutex); | ||
| 483 | |||
| 484 | kfree(pxent); | ||
| 485 | if (err != -ENOENT) { | ||
| 486 | ubifs_err("cannot find next direntry, error %d", err); | ||
| 487 | return err; | ||
| 488 | } | ||
| 489 | |||
| 490 | ubifs_assert(written <= size); | ||
| 491 | return written; | ||
| 492 | } | ||
| 493 | |||
| 494 | static int remove_xattr(struct ubifs_info *c, struct inode *host, | ||
| 495 | struct inode *inode, const struct qstr *nm) | ||
| 496 | { | ||
| 497 | int err; | ||
| 498 | struct ubifs_inode *host_ui = ubifs_inode(host); | ||
| 499 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 500 | struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, | ||
| 501 | .dirtied_ino_d = host_ui->data_len }; | ||
| 502 | |||
| 503 | ubifs_assert(ui->data_len == inode->i_size); | ||
| 504 | |||
| 505 | err = ubifs_budget_space(c, &req); | ||
| 506 | if (err) | ||
| 507 | return err; | ||
| 508 | |||
| 509 | mutex_lock(&host_ui->ui_mutex); | ||
| 510 | host->i_ctime = ubifs_current_time(host); | ||
| 511 | host_ui->xattr_cnt -= 1; | ||
| 512 | host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); | ||
| 513 | host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); | ||
| 514 | host_ui->xattr_names -= nm->len; | ||
| 515 | |||
| 516 | err = ubifs_jnl_delete_xattr(c, host, inode, nm); | ||
| 517 | if (err) | ||
| 518 | goto out_cancel; | ||
| 519 | mutex_unlock(&host_ui->ui_mutex); | ||
| 520 | |||
| 521 | ubifs_release_budget(c, &req); | ||
| 522 | return 0; | ||
| 523 | |||
| 524 | out_cancel: | ||
| 525 | host_ui->xattr_cnt += 1; | ||
| 526 | host_ui->xattr_size += CALC_DENT_SIZE(nm->len); | ||
| 527 | host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); | ||
| 528 | mutex_unlock(&host_ui->ui_mutex); | ||
| 529 | ubifs_release_budget(c, &req); | ||
| 530 | make_bad_inode(inode); | ||
| 531 | return err; | ||
| 532 | } | ||
| 533 | |||
| 534 | int ubifs_removexattr(struct dentry *dentry, const char *name) | ||
| 535 | { | ||
| 536 | struct inode *inode, *host = dentry->d_inode; | ||
| 537 | struct ubifs_info *c = host->i_sb->s_fs_info; | ||
| 538 | struct qstr nm = { .name = name, .len = strlen(name) }; | ||
| 539 | struct ubifs_dent_node *xent; | ||
| 540 | union ubifs_key key; | ||
| 541 | int err; | ||
| 542 | |||
| 543 | dbg_gen("xattr '%s', ino %lu ('%.*s')", name, | ||
| 544 | host->i_ino, dentry->d_name.len, dentry->d_name.name); | ||
| 545 | ubifs_assert(mutex_is_locked(&host->i_mutex)); | ||
| 546 | |||
| 547 | err = check_namespace(&nm); | ||
| 548 | if (err < 0) | ||
| 549 | return err; | ||
| 550 | |||
| 551 | xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); | ||
| 552 | if (!xent) | ||
| 553 | return -ENOMEM; | ||
| 554 | |||
| 555 | xent_key_init(c, &key, host->i_ino, &nm); | ||
| 556 | err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); | ||
| 557 | if (err) { | ||
| 558 | if (err == -ENOENT) | ||
| 559 | err = -ENODATA; | ||
| 560 | goto out_free; | ||
| 561 | } | ||
| 562 | |||
| 563 | inode = iget_xattr(c, le64_to_cpu(xent->inum)); | ||
| 564 | if (IS_ERR(inode)) { | ||
| 565 | err = PTR_ERR(inode); | ||
| 566 | goto out_free; | ||
| 567 | } | ||
| 568 | |||
| 569 | ubifs_assert(inode->i_nlink == 1); | ||
| 570 | inode->i_nlink = 0; | ||
| 571 | err = remove_xattr(c, host, inode, &nm); | ||
| 572 | if (err) | ||
| 573 | inode->i_nlink = 1; | ||
| 574 | |||
| 575 | /* If @i_nlink is 0, 'iput()' will delete the inode */ | ||
| 576 | iput(inode); | ||
| 577 | |||
| 578 | out_free: | ||
| 579 | kfree(xent); | ||
| 580 | return err; | ||
| 581 | } | ||
diff --git a/fs/udf/super.c b/fs/udf/super.c index 7a5f69be6ac2..44cc702f96cc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
| @@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent) | |||
| 682 | /* | 682 | /* |
| 683 | * Check whether there is an anchor block in the given block | 683 | * Check whether there is an anchor block in the given block |
| 684 | */ | 684 | */ |
| 685 | static int udf_check_anchor_block(struct super_block *sb, sector_t block, | 685 | static int udf_check_anchor_block(struct super_block *sb, sector_t block) |
| 686 | bool varconv) | ||
| 687 | { | 686 | { |
| 688 | struct buffer_head *bh = NULL; | 687 | struct buffer_head *bh; |
| 689 | tag *t; | ||
| 690 | uint16_t ident; | 688 | uint16_t ident; |
| 691 | uint32_t location; | ||
| 692 | 689 | ||
| 693 | if (varconv) { | 690 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && |
| 694 | if (udf_fixed_to_variable(block) >= | 691 | udf_fixed_to_variable(block) >= |
| 695 | sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) | 692 | sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) |
| 696 | return 0; | 693 | return 0; |
| 697 | bh = sb_bread(sb, udf_fixed_to_variable(block)); | ||
| 698 | } | ||
| 699 | else | ||
| 700 | bh = sb_bread(sb, block); | ||
| 701 | 694 | ||
| 695 | bh = udf_read_tagged(sb, block, block, &ident); | ||
| 702 | if (!bh) | 696 | if (!bh) |
| 703 | return 0; | 697 | return 0; |
| 704 | |||
| 705 | t = (tag *)bh->b_data; | ||
| 706 | ident = le16_to_cpu(t->tagIdent); | ||
| 707 | location = le32_to_cpu(t->tagLocation); | ||
| 708 | brelse(bh); | 698 | brelse(bh); |
| 709 | if (ident != TAG_IDENT_AVDP) | 699 | |
| 710 | return 0; | 700 | return ident == TAG_IDENT_AVDP; |
| 711 | return location == block; | ||
| 712 | } | 701 | } |
| 713 | 702 | ||
| 714 | /* Search for an anchor volume descriptor pointer */ | 703 | /* Search for an anchor volume descriptor pointer */ |
| 715 | static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, | 704 | static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock) |
| 716 | sector_t lastblock) | ||
| 717 | { | 705 | { |
| 718 | sector_t last[6]; | 706 | sector_t last[6]; |
| 719 | int i; | 707 | int i; |
| @@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, | |||
| 739 | sb->s_blocksize_bits) | 727 | sb->s_blocksize_bits) |
| 740 | continue; | 728 | continue; |
| 741 | 729 | ||
| 742 | if (udf_check_anchor_block(sb, last[i], varconv)) { | 730 | if (udf_check_anchor_block(sb, last[i])) { |
| 743 | sbi->s_anchor[0] = last[i]; | 731 | sbi->s_anchor[0] = last[i]; |
| 744 | sbi->s_anchor[1] = last[i] - 256; | 732 | sbi->s_anchor[1] = last[i] - 256; |
| 745 | return last[i]; | 733 | return last[i]; |
| @@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, | |||
| 748 | if (last[i] < 256) | 736 | if (last[i] < 256) |
| 749 | continue; | 737 | continue; |
| 750 | 738 | ||
| 751 | if (udf_check_anchor_block(sb, last[i] - 256, varconv)) { | 739 | if (udf_check_anchor_block(sb, last[i] - 256)) { |
| 752 | sbi->s_anchor[1] = last[i] - 256; | 740 | sbi->s_anchor[1] = last[i] - 256; |
| 753 | return last[i]; | 741 | return last[i]; |
| 754 | } | 742 | } |
| 755 | } | 743 | } |
| 756 | 744 | ||
| 757 | if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) { | 745 | if (udf_check_anchor_block(sb, sbi->s_session + 256)) { |
| 758 | sbi->s_anchor[0] = sbi->s_session + 256; | 746 | sbi->s_anchor[0] = sbi->s_session + 256; |
| 759 | return last[0]; | 747 | return last[0]; |
| 760 | } | 748 | } |
| 761 | if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) { | 749 | if (udf_check_anchor_block(sb, sbi->s_session + 512)) { |
| 762 | sbi->s_anchor[0] = sbi->s_session + 512; | 750 | sbi->s_anchor[0] = sbi->s_session + 512; |
| 763 | return last[0]; | 751 | return last[0]; |
| 764 | } | 752 | } |
| @@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb) | |||
| 780 | int i; | 768 | int i; |
| 781 | struct udf_sb_info *sbi = UDF_SB(sb); | 769 | struct udf_sb_info *sbi = UDF_SB(sb); |
| 782 | 770 | ||
| 783 | lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block); | 771 | lastblock = udf_scan_anchors(sb, sbi->s_last_block); |
| 784 | if (lastblock) | 772 | if (lastblock) |
| 785 | goto check_anchor; | 773 | goto check_anchor; |
| 786 | 774 | ||
| 787 | /* No anchor found? Try VARCONV conversion of block numbers */ | 775 | /* No anchor found? Try VARCONV conversion of block numbers */ |
| 776 | UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); | ||
| 788 | /* Firstly, we try to not convert number of the last block */ | 777 | /* Firstly, we try to not convert number of the last block */ |
| 789 | lastblock = udf_scan_anchors(sb, 1, | 778 | lastblock = udf_scan_anchors(sb, |
| 790 | udf_variable_to_fixed(sbi->s_last_block)); | 779 | udf_variable_to_fixed(sbi->s_last_block)); |
| 791 | if (lastblock) { | 780 | if (lastblock) |
| 792 | UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); | ||
| 793 | goto check_anchor; | 781 | goto check_anchor; |
| 794 | } | ||
| 795 | 782 | ||
| 796 | /* Secondly, we try with converted number of the last block */ | 783 | /* Secondly, we try with converted number of the last block */ |
| 797 | lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block); | 784 | lastblock = udf_scan_anchors(sb, sbi->s_last_block); |
| 798 | if (lastblock) | 785 | if (!lastblock) { |
| 799 | UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); | 786 | /* VARCONV didn't help. Clear it. */ |
| 787 | UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); | ||
| 788 | } | ||
| 800 | 789 | ||
| 801 | check_anchor: | 790 | check_anchor: |
| 802 | /* | 791 | /* |
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8fa9c2d70911..8ec865de5f13 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | #define UDF_PREALLOCATE | 16 | #define UDF_PREALLOCATE |
| 17 | #define UDF_DEFAULT_PREALLOC_BLOCKS 8 | 17 | #define UDF_DEFAULT_PREALLOC_BLOCKS 8 |
| 18 | 18 | ||
| 19 | #define UDFFS_DEBUG | 19 | #undef UDFFS_DEBUG |
| 20 | 20 | ||
| 21 | #ifdef UDFFS_DEBUG | 21 | #ifdef UDFFS_DEBUG |
| 22 | #define udf_debug(f, a...) \ | 22 | #define udf_debug(f, a...) \ |
diff --git a/fs/utimes.c b/fs/utimes.c index af059d5cb485..b6b664e7145e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c | |||
| @@ -40,14 +40,9 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) | |||
| 40 | 40 | ||
| 41 | #endif | 41 | #endif |
| 42 | 42 | ||
| 43 | static bool nsec_special(long nsec) | ||
| 44 | { | ||
| 45 | return nsec == UTIME_OMIT || nsec == UTIME_NOW; | ||
| 46 | } | ||
| 47 | |||
| 48 | static bool nsec_valid(long nsec) | 43 | static bool nsec_valid(long nsec) |
| 49 | { | 44 | { |
| 50 | if (nsec_special(nsec)) | 45 | if (nsec == UTIME_OMIT || nsec == UTIME_NOW) |
| 51 | return true; | 46 | return true; |
| 52 | 47 | ||
| 53 | return nsec >= 0 && nsec <= 999999999; | 48 | return nsec >= 0 && nsec <= 999999999; |
| @@ -102,7 +97,11 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags | |||
| 102 | if (error) | 97 | if (error) |
| 103 | goto dput_and_out; | 98 | goto dput_and_out; |
| 104 | 99 | ||
| 105 | /* Don't worry, the checks are done in inode_change_ok() */ | 100 | if (times && times[0].tv_nsec == UTIME_NOW && |
| 101 | times[1].tv_nsec == UTIME_NOW) | ||
| 102 | times = NULL; | ||
| 103 | |||
| 104 | /* In most cases, the checks are done in inode_change_ok() */ | ||
| 106 | newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; | 105 | newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; |
| 107 | if (times) { | 106 | if (times) { |
| 108 | error = -EPERM; | 107 | error = -EPERM; |
| @@ -124,28 +123,34 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags | |||
| 124 | newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; | 123 | newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; |
| 125 | newattrs.ia_valid |= ATTR_MTIME_SET; | 124 | newattrs.ia_valid |= ATTR_MTIME_SET; |
| 126 | } | 125 | } |
| 127 | } | ||
| 128 | 126 | ||
| 129 | /* | 127 | /* |
| 130 | * If times is NULL or both times are either UTIME_OMIT or | 128 | * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT |
| 131 | * UTIME_NOW, then need to check permissions, because | 129 | * cases, we need to make an extra check that is not done by |
| 132 | * inode_change_ok() won't do it. | 130 | * inode_change_ok(). |
| 133 | */ | 131 | */ |
| 134 | if (!times || (nsec_special(times[0].tv_nsec) && | 132 | if (((times[0].tv_nsec == UTIME_NOW && |
| 135 | nsec_special(times[1].tv_nsec))) { | 133 | times[1].tv_nsec == UTIME_OMIT) |
| 134 | || | ||
| 135 | (times[0].tv_nsec == UTIME_OMIT && | ||
| 136 | times[1].tv_nsec == UTIME_NOW)) | ||
| 137 | && !is_owner_or_cap(inode)) | ||
| 138 | goto mnt_drop_write_and_out; | ||
| 139 | } else { | ||
| 140 | |||
| 141 | /* | ||
| 142 | * If times is NULL (or both times are UTIME_NOW), | ||
| 143 | * then we need to check permissions, because | ||
| 144 | * inode_change_ok() won't do it. | ||
| 145 | */ | ||
| 136 | error = -EACCES; | 146 | error = -EACCES; |
| 137 | if (IS_IMMUTABLE(inode)) | 147 | if (IS_IMMUTABLE(inode)) |
| 138 | goto mnt_drop_write_and_out; | 148 | goto mnt_drop_write_and_out; |
| 139 | 149 | ||
| 140 | if (!is_owner_or_cap(inode)) { | 150 | if (!is_owner_or_cap(inode)) { |
| 141 | if (f) { | 151 | error = permission(inode, MAY_WRITE, NULL); |
| 142 | if (!(f->f_mode & FMODE_WRITE)) | 152 | if (error) |
| 143 | goto mnt_drop_write_and_out; | 153 | goto mnt_drop_write_and_out; |
| 144 | } else { | ||
| 145 | error = vfs_permission(&nd, MAY_WRITE); | ||
| 146 | if (error) | ||
| 147 | goto mnt_drop_write_and_out; | ||
| 148 | } | ||
| 149 | } | 154 | } |
| 150 | } | 155 | } |
| 151 | mutex_lock(&inode->i_mutex); | 156 | mutex_lock(&inode->i_mutex); |
| @@ -169,14 +174,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __ | |||
| 169 | if (utimes) { | 174 | if (utimes) { |
| 170 | if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) | 175 | if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) |
| 171 | return -EFAULT; | 176 | return -EFAULT; |
| 172 | if ((tstimes[0].tv_nsec == UTIME_OMIT || | ||
| 173 | tstimes[0].tv_nsec == UTIME_NOW) && | ||
| 174 | tstimes[0].tv_sec != 0) | ||
| 175 | return -EINVAL; | ||
| 176 | if ((tstimes[1].tv_nsec == UTIME_OMIT || | ||
| 177 | tstimes[1].tv_nsec == UTIME_NOW) && | ||
| 178 | tstimes[1].tv_sec != 0) | ||
| 179 | return -EINVAL; | ||
| 180 | 177 | ||
| 181 | /* Nothing to do, we must not even check the path. */ | 178 | /* Nothing to do, we must not even check the path. */ |
| 182 | if (tstimes[0].tv_nsec == UTIME_OMIT && | 179 | if (tstimes[0].tv_nsec == UTIME_OMIT && |
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index a3522727ea5b..b546ba69be82 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c | |||
| @@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, | |||
| 645 | if (len == 0) | 645 | if (len == 0) |
| 646 | return -ENOENT; | 646 | return -ENOENT; |
| 647 | 647 | ||
| 648 | slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL); | 648 | slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); |
| 649 | if (slots == NULL) | 649 | if (slots == NULL) |
| 650 | return -ENOMEM; | 650 | return -ENOMEM; |
| 651 | 651 | ||
| @@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, | |||
| 687 | struct dentry *alias; | 687 | struct dentry *alias; |
| 688 | int err, table; | 688 | int err, table; |
| 689 | 689 | ||
| 690 | lock_kernel(); | 690 | lock_super(sb); |
| 691 | table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; | 691 | table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; |
| 692 | dentry->d_op = &vfat_dentry_ops[table]; | 692 | dentry->d_op = &vfat_dentry_ops[table]; |
| 693 | 693 | ||
| @@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, | |||
| 699 | inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); | 699 | inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); |
| 700 | brelse(sinfo.bh); | 700 | brelse(sinfo.bh); |
| 701 | if (IS_ERR(inode)) { | 701 | if (IS_ERR(inode)) { |
| 702 | unlock_kernel(); | 702 | unlock_super(sb); |
| 703 | return ERR_CAST(inode); | 703 | return ERR_CAST(inode); |
| 704 | } | 704 | } |
| 705 | alias = d_find_alias(inode); | 705 | alias = d_find_alias(inode); |
| @@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, | |||
| 708 | dput(alias); | 708 | dput(alias); |
| 709 | else { | 709 | else { |
| 710 | iput(inode); | 710 | iput(inode); |
| 711 | unlock_kernel(); | 711 | unlock_super(sb); |
| 712 | return alias; | 712 | return alias; |
| 713 | } | 713 | } |
| 714 | 714 | ||
| 715 | } | 715 | } |
| 716 | error: | 716 | error: |
| 717 | unlock_kernel(); | 717 | unlock_super(sb); |
| 718 | dentry->d_op = &vfat_dentry_ops[table]; | 718 | dentry->d_op = &vfat_dentry_ops[table]; |
| 719 | dentry->d_time = dentry->d_parent->d_inode->i_version; | 719 | dentry->d_time = dentry->d_parent->d_inode->i_version; |
| 720 | dentry = d_splice_alias(inode, dentry); | 720 | dentry = d_splice_alias(inode, dentry); |
| @@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, | |||
| 734 | struct timespec ts; | 734 | struct timespec ts; |
| 735 | int err; | 735 | int err; |
| 736 | 736 | ||
| 737 | lock_kernel(); | 737 | lock_super(sb); |
| 738 | 738 | ||
| 739 | ts = CURRENT_TIME_SEC; | 739 | ts = CURRENT_TIME_SEC; |
| 740 | err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); | 740 | err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); |
| @@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, | |||
| 755 | dentry->d_time = dentry->d_parent->d_inode->i_version; | 755 | dentry->d_time = dentry->d_parent->d_inode->i_version; |
| 756 | d_instantiate(dentry, inode); | 756 | d_instantiate(dentry, inode); |
| 757 | out: | 757 | out: |
| 758 | unlock_kernel(); | 758 | unlock_super(sb); |
| 759 | return err; | 759 | return err; |
| 760 | } | 760 | } |
| 761 | 761 | ||
| 762 | static int vfat_rmdir(struct inode *dir, struct dentry *dentry) | 762 | static int vfat_rmdir(struct inode *dir, struct dentry *dentry) |
| 763 | { | 763 | { |
| 764 | struct inode *inode = dentry->d_inode; | 764 | struct inode *inode = dentry->d_inode; |
| 765 | struct super_block *sb = dir->i_sb; | ||
| 765 | struct fat_slot_info sinfo; | 766 | struct fat_slot_info sinfo; |
| 766 | int err; | 767 | int err; |
| 767 | 768 | ||
| 768 | lock_kernel(); | 769 | lock_super(sb); |
| 769 | 770 | ||
| 770 | err = fat_dir_empty(inode); | 771 | err = fat_dir_empty(inode); |
| 771 | if (err) | 772 | if (err) |
| @@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 783 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; | 784 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; |
| 784 | fat_detach(inode); | 785 | fat_detach(inode); |
| 785 | out: | 786 | out: |
| 786 | unlock_kernel(); | 787 | unlock_super(sb); |
| 787 | 788 | ||
| 788 | return err; | 789 | return err; |
| 789 | } | 790 | } |
| @@ -791,10 +792,11 @@ out: | |||
| 791 | static int vfat_unlink(struct inode *dir, struct dentry *dentry) | 792 | static int vfat_unlink(struct inode *dir, struct dentry *dentry) |
| 792 | { | 793 | { |
| 793 | struct inode *inode = dentry->d_inode; | 794 | struct inode *inode = dentry->d_inode; |
| 795 | struct super_block *sb = dir->i_sb; | ||
| 794 | struct fat_slot_info sinfo; | 796 | struct fat_slot_info sinfo; |
| 795 | int err; | 797 | int err; |
| 796 | 798 | ||
| 797 | lock_kernel(); | 799 | lock_super(sb); |
| 798 | 800 | ||
| 799 | err = vfat_find(dir, &dentry->d_name, &sinfo); | 801 | err = vfat_find(dir, &dentry->d_name, &sinfo); |
| 800 | if (err) | 802 | if (err) |
| @@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) | |||
| 807 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; | 809 | inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; |
| 808 | fat_detach(inode); | 810 | fat_detach(inode); |
| 809 | out: | 811 | out: |
| 810 | unlock_kernel(); | 812 | unlock_super(sb); |
| 811 | 813 | ||
| 812 | return err; | 814 | return err; |
| 813 | } | 815 | } |
| @@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 820 | struct timespec ts; | 822 | struct timespec ts; |
| 821 | int err, cluster; | 823 | int err, cluster; |
| 822 | 824 | ||
| 823 | lock_kernel(); | 825 | lock_super(sb); |
| 824 | 826 | ||
| 825 | ts = CURRENT_TIME_SEC; | 827 | ts = CURRENT_TIME_SEC; |
| 826 | cluster = fat_alloc_new_dir(dir, &ts); | 828 | cluster = fat_alloc_new_dir(dir, &ts); |
| @@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 849 | dentry->d_time = dentry->d_parent->d_inode->i_version; | 851 | dentry->d_time = dentry->d_parent->d_inode->i_version; |
| 850 | d_instantiate(dentry, inode); | 852 | d_instantiate(dentry, inode); |
| 851 | 853 | ||
| 852 | unlock_kernel(); | 854 | unlock_super(sb); |
| 853 | return 0; | 855 | return 0; |
| 854 | 856 | ||
| 855 | out_free: | 857 | out_free: |
| 856 | fat_free_clusters(dir, cluster); | 858 | fat_free_clusters(dir, cluster); |
| 857 | out: | 859 | out: |
| 858 | unlock_kernel(); | 860 | unlock_super(sb); |
| 859 | return err; | 861 | return err; |
| 860 | } | 862 | } |
| 861 | 863 | ||
| @@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 869 | struct timespec ts; | 871 | struct timespec ts; |
| 870 | loff_t dotdot_i_pos, new_i_pos; | 872 | loff_t dotdot_i_pos, new_i_pos; |
| 871 | int err, is_dir, update_dotdot, corrupt = 0; | 873 | int err, is_dir, update_dotdot, corrupt = 0; |
| 874 | struct super_block *sb = old_dir->i_sb; | ||
| 872 | 875 | ||
| 873 | old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; | 876 | old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; |
| 874 | old_inode = old_dentry->d_inode; | 877 | old_inode = old_dentry->d_inode; |
| 875 | new_inode = new_dentry->d_inode; | 878 | new_inode = new_dentry->d_inode; |
| 876 | lock_kernel(); | 879 | lock_super(sb); |
| 877 | err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); | 880 | err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); |
| 878 | if (err) | 881 | if (err) |
| 879 | goto out; | 882 | goto out; |
| @@ -951,7 +954,7 @@ out: | |||
| 951 | brelse(sinfo.bh); | 954 | brelse(sinfo.bh); |
| 952 | brelse(dotdot_bh); | 955 | brelse(dotdot_bh); |
| 953 | brelse(old_sinfo.bh); | 956 | brelse(old_sinfo.bh); |
| 954 | unlock_kernel(); | 957 | unlock_super(sb); |
| 955 | 958 | ||
| 956 | return err; | 959 | return err; |
| 957 | 960 | ||
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index afaee301b0ee..ad3d26ddfe31 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
| @@ -2427,13 +2427,20 @@ restart: | |||
| 2427 | if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { | 2427 | if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { |
| 2428 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); | 2428 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); |
| 2429 | 2429 | ||
| 2430 | /* If I'm the only one writing to this iclog, sync it to disk */ | 2430 | /* |
| 2431 | if (atomic_read(&iclog->ic_refcnt) == 1) { | 2431 | * If I'm the only one writing to this iclog, sync it to disk. |
| 2432 | * We need to do an atomic compare and decrement here to avoid | ||
| 2433 | * racing with concurrent atomic_dec_and_lock() calls in | ||
| 2434 | * xlog_state_release_iclog() when there is more than one | ||
| 2435 | * reference to the iclog. | ||
| 2436 | */ | ||
| 2437 | if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { | ||
| 2438 | /* we are the only one */ | ||
| 2432 | spin_unlock(&log->l_icloglock); | 2439 | spin_unlock(&log->l_icloglock); |
| 2433 | if ((error = xlog_state_release_iclog(log, iclog))) | 2440 | error = xlog_state_release_iclog(log, iclog); |
| 2441 | if (error) | ||
| 2434 | return error; | 2442 | return error; |
| 2435 | } else { | 2443 | } else { |
| 2436 | atomic_dec(&iclog->ic_refcnt); | ||
| 2437 | spin_unlock(&log->l_icloglock); | 2444 | spin_unlock(&log->l_icloglock); |
| 2438 | } | 2445 | } |
| 2439 | goto restart; | 2446 | goto restart; |
