117 files changed, 6266 insertions, 4246 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index a52cf6280b4b..17216ba99c85 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -930,7 +930,7 @@ config PROC_KCORE
 config PROC_VMCORE
        bool "/proc/vmcore support (EXPERIMENTAL)"
-        depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+        depends on PROC_FS && CRASH_DUMP
        default y
        help
        Exports the dump image of crashed kernel in ELF format.
@@ -1547,10 +1547,6 @@ config UFS_FS
          The recently released UFS2 variant (used in FreeBSD 5.x) is
          READ-ONLY supported.
-          If you only intend to mount files from some other Unix over the
-          network using NFS, you don't need the UFS file system support (but
-          you need NFS file system support obviously).
          Note that this option is generally not needed for floppies, since a
          good portable way to transport files and directories between unixes
          (and even other operating systems) is given by the tar program ("man
@@ -1590,6 +1586,7 @@ menuconfig NETWORK_FILESYSTEMS
          Say Y here to get to see options for network filesystems and
          filesystem-related networking code, such as NFS daemon and
          RPCSEC security modules.
          This option alone does not add any kernel code.
          If you say N, all options in this submenu will be skipped and
@@ -1598,76 +1595,92 @@ menuconfig NETWORK_FILESYSTEMS
 if NETWORK_FILESYSTEMS
 config NFS_FS
-        tristate "NFS file system support"
+        tristate "NFS client support"
        depends on INET
        select LOCKD
        select SUNRPC
        select NFS_ACL_SUPPORT if NFS_V3_ACL
        help
-          If you are connected to some other (usually local) Unix computer
+          Choose Y here if you want to access files residing on other
-          (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing
+          computers using Sun's Network File System protocol.  To compile
-          on that computer (the NFS server) using the Network File Sharing
+          this file system support as a module, choose M here: the module
-          protocol, say Y. "Mounting files" means that the client can access
+          will be called nfs.
-          the files with usual UNIX commands as if they were sitting on the
-          client's hard disk. For this to work, the server must run the
-          programs nfsd and mountd (but does not need to have NFS file system
-          support enabled in its kernel). NFS is explained in the Network
-          Administrator's Guide, available from
-          <http://www.tldp.org/docs.html#guide>, on its man page: "man
-          nfs", and in the NFS-HOWTO.
-          A superior but less widely used alternative to NFS is provided by
-          the Coda file system; see "Coda file system support" below.
-          If you say Y here, you should have said Y to TCP/IP networking also.
+          To mount file systems exported by NFS servers, you also need to
-          This option would enlarge your kernel by about 27 KB.
+          install the user space mount.nfs command which can be found in
+          the Linux nfs-utils package, available from http://linux-nfs.org/.
+          Information about using the mount command is available in the
+          mount(8) man page.  More detail about the Linux NFS client
+          implementation is available via the nfs(5) man page.
-          To compile this file system support as a module, choose M here: the
+          Below you can choose which versions of the NFS protocol are
-          module will be called nfs.
+          available in the kernel to mount NFS servers.  Support for NFS
+          version 2 (RFC 1094) is always available when NFS_FS is selected.
-          If you are configuring a diskless machine which will mount its root
+          To configure a system which mounts its root file system via NFS
-          file system over NFS at boot time, say Y here and to "Kernel
+          at boot time, say Y here, select "Kernel level IP
-          level IP autoconfiguration" above and to "Root file system on NFS"
+          autoconfiguration" in the NETWORK menu, and select "Root file
-          below. You cannot compile this driver as a module in this case.
+          system on NFS" below.  You cannot compile this file system as a
-          There are two packages designed for booting diskless machines over
+          module in this case.
-          the net: netboot, available from
-          <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
-          available from <http://ftp1.sourceforge.net/etherboot/>.
-          If you don't know what all this is about, say N.
+          If unsure, say N.
 config NFS_V3
-        bool "Provide NFSv3 client support"
+        bool "NFS client support for NFS version 3"
        depends on NFS_FS
        help
-          Say Y here if you want your NFS client to be able to speak version
+          This option enables support for version 3 of the NFS protocol
-          3 of the NFS protocol.
+          (RFC 1813) in the kernel's NFS client.
          If unsure, say Y.
 config NFS_V3_ACL
-        bool "Provide client support for the NFSv3 ACL protocol extension"
+        bool "NFS client support for the NFSv3 ACL protocol extension"
        depends on NFS_V3
        help
-          Implement the NFSv3 ACL protocol extension for manipulating POSIX
+          Some NFS servers support an auxiliary NFSv3 ACL protocol that
-          Access Control Lists.  The server should also be compiled with
+          Sun added to Solaris but which never became an official part of the
-          the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option.
+          NFS version 3 protocol.  This protocol extension allows
+          applications on NFS clients to manipulate POSIX Access Control
+          Lists on files residing on NFS servers.  NFS servers enforce
+          ACLs on local files whether this protocol is available or not.
+          Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+          protocol extension and you want your NFS client to allow
+          applications to access and modify ACLs on files on the server.
+          Most NFS servers don't support the Solaris NFSv3 ACL protocol
+          extension.  You can choose N here or specify the "noacl" mount
+          option to prevent your NFS client from trying to use the NFSv3
+          ACL protocol.
          If unsure, say N.
 config NFS_V4
-        bool "Provide NFSv4 client support (EXPERIMENTAL)"
+        bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
        depends on NFS_FS && EXPERIMENTAL
        select RPCSEC_GSS_KRB5
        help
-          Say Y here if you want your NFS client to be able to speak the newer
+          This option enables support for version 4 of the NFS protocol
-          version 4 of the NFS protocol.
+          (RFC 3530) in the kernel's NFS client.
-          Note: Requires auxiliary userspace daemons which may be found on
+          To mount NFS servers using NFSv4, you also need to install user
-                http://www.citi.umich.edu/projects/nfsv4/
+          space programs which can be found in the Linux nfs-utils package,
+          available from http://linux-nfs.org/.
          If unsure, say N.
+config ROOT_NFS
+        bool "Root file system on NFS"
+        depends on NFS_FS=y && IP_PNP
+        help
+          If you want your system to mount its root file system via NFS,
+          choose Y here.  This is common practice for managing systems
+          without local permanent storage.  For details, read
+          <file:Documentation/filesystems/nfsroot.txt>.
+          Most people say N here.
 config NFSD
        tristate "NFS server support"
        depends on INET
@@ -1749,20 +1762,6 @@ config NFSD_V4
          If unsure, say N.
-config ROOT_NFS
-        bool "Root file system on NFS"
-        depends on NFS_FS=y && IP_PNP
-        help
-          If you want your Linux box to mount its whole root file system (the
-          one containing the directory /) from some other computer over the
-          net via NFS (presumably because your box doesn't have a hard disk),
-          say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
-          details. It is likely that in this case, you also want to say Y to
-          "Kernel level IP autoconfiguration" so that your box can discover
-          its network address at boot time.
-          Most people say N here.
 config LOCKD
        tristate
@@ -1803,27 +1802,6 @@ config SUNRPC_XPRT_RDMA
          If unsure, say N.
-config SUNRPC_BIND34
-        bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
-        depends on SUNRPC && EXPERIMENTAL
-        default n
-        help
-          RPC requests over IPv6 networks require support for larger
-          addresses when performing an RPC bind.  Sun added support for
-          IPv6 addressing by creating two new versions of the rpcbind
-          protocol (RFC 1833).
-          This option enables support in the kernel RPC client for
-          querying rpcbind servers via versions 3 and 4 of the rpcbind
-          protocol.  The kernel automatically falls back to version 2
-          if a remote rpcbind service does not support versions 3 or 4.
-          By themselves, these new versions do not provide support for
-          RPC over IPv6, but the new protocol versions are necessary to
-          support it.
-          If unsure, say N to get traditional behavior (version 2 rpcbind
-          requests only).
 config RPCSEC_GSS_KRB5
        tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
        depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index fcae06aaadc5..3b2178b4bb66 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
 obj-y +=        no-block.o
 endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-$(CONFIG_INOTIFY)           += inotify.o
 obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
 obj-$(CONFIG_EPOLL)             += eventpoll.o
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio:        bio to attach integrity metadata to
+ * @gfp_mask:   Memory allocation mask
+ * @nr_vecs:    Number of integrity metadata scatter-gather elements
+ * @bs:         bio_set to allocate from
+ *
+ * Description: Takes a payload from the integrity pool of @bs, zeroes
+ * it, requests a vector array for @nr_vecs entries from @bs and links
+ * payload and bio to each other.  Returns the new payload, or NULL if
+ * either allocation fails, in which case @bio is left unmodified.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+                                                         gfp_t gfp_mask,
+                                                         unsigned int nr_vecs,
+                                                         struct bio_set *bs)
+{
+        struct bio_integrity_payload *payload;
+        unsigned long pool_idx;
+        BUG_ON(!bio);
+        payload = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+        if (unlikely(!payload)) {
+                printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+                goto fail;
+        }
+        memset(payload, 0, sizeof(*payload));
+        payload->bip_vec = bvec_alloc_bs(gfp_mask, nr_vecs, &pool_idx, bs);
+        if (unlikely(!payload->bip_vec)) {
+                printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+                goto fail_free_payload;
+        }
+        /* Remember which bvec pool the array came from for the free path */
+        payload->bip_pool = pool_idx;
+        payload->bip_bio = bio;
+        bio->bi_integrity = payload;
+        return payload;
+fail_free_payload:
+        mempool_free(payload, bs->bio_integrity_pool);
+fail:
+        return NULL;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio:        bio to attach integrity metadata to
+ * @gfp_mask:   Memory allocation mask
+ * @nr_vecs:    Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata.  nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.  It is a convenience
+ * wrapper that calls bio_integrity_alloc_bioset() with fs_bio_set and
+ * returns its result.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+                                                  gfp_t gfp_mask,
+                                                  unsigned int nr_vecs)
+{
+        return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio:        bio containing bip to be freed
+ * @bs:         bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().  The protection buffer is released only
+ * when @bio is not a clone; the vector array and the payload itself
+ * go back to the pools of @bs, and bio->bi_integrity is cleared.
+ * The bio must have a payload attached (BUG_ON otherwise).
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        BUG_ON(bip == NULL);
+        /*
+         * A cloned bio doesn't own the integrity metadata.  kfree()
+         * ignores a NULL pointer, so bip_buf needs no separate check.
+         */
+        if (!bio_flagged(bio, BIO_CLONED))
+                kfree(bip->bip_buf);
+        mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+        mempool_free(bip, bs->bio_integrity_pool);
+        bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio:        bio to update
+ * @page:       page containing integrity metadata
+ * @len:        number of bytes of integrity metadata in page
+ * @offset:     start offset within page
+ *
+ * Description: Records @page, @len and @offset in the next unused
+ * entry of the bio's integrity vector.  Returns @len on success, or 0
+ * when the vector already holds as many entries as its pool allows.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+                           unsigned int len, unsigned int offset)
+{
+        struct bio_integrity_payload *payload = bio->bi_integrity;
+        struct bio_vec *slot;
+        if (payload->bip_vcnt < bvec_nr_vecs(payload->bip_pool)) {
+                slot = bip_vec_idx(payload, payload->bip_vcnt);
+                BUG_ON(slot == NULL);
+                /* The slot must not already describe a page */
+                BUG_ON(slot->bv_page != NULL);
+                slot->bv_offset = offset;
+                slot->bv_len = len;
+                slot->bv_page = page;
+                payload->bip_vcnt++;
+                return len;
+        }
+        printk(KERN_ERR "%s: bip_vec full\n", __func__);
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio:        bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not.  bio data direction and target device must be
+ * set prior to calling.  The function honors the write_generate and
+ * read_verify flags in sysfs.  Returns 0 for a bio that already has
+ * an integrity payload, otherwise the result of
+ * bdev_integrity_enabled() for the bio's device and data direction.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+        /* A bio that is already protected must not be prepared again */
+        return bio_integrity(bio) ? 0 :
+                bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi:         blk_integrity profile for device
+ * @sectors:    Number of 512-byte sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device.  Convert the block layer sectors
+ * to physical sectors.  Returns @sectors divided by 8 for a profile
+ * with 4096-byte sectors and @sectors unchanged otherwise.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+                                                    unsigned int sectors)
+{
+        /* At this point there are only 512b or 4096b DIF/EPP devices */
+        if (bi->sector_size == 4096)
+                return sectors >> 3;
+        return sectors;
+}
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio:        bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio, i.e. the profile's tag_size for every
+ * hardware sector covered by bio->bi_size.  Filesystems can use this
+ * to determine how much metadata to attach to an I/O.  The bio must
+ * not be empty (BUG_ON otherwise).
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+        struct blk_integrity *profile = bdev_get_integrity(bio->bi_bdev);
+        unsigned int nr_hw_sectors;
+        BUG_ON(bio->bi_size == 0);
+        nr_hw_sectors = bio->bi_size / profile->sector_size;
+        return profile->tag_size * nr_hw_sectors;
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+/*
+ * Common helper behind bio_integrity_set_tag() and
+ * bio_integrity_get_tag().  Hands the bio's protection buffer and
+ * @tag_buf to the profile's set_tag_fn (@set non-zero) or get_tag_fn
+ * (@set zero).  Returns 0 on success, or -1 if the profile has no tag
+ * space or @len needs more room than the protection buffer offers.
+ * The bio must already own a protection buffer (BUG_ON otherwise).
+ */
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+        struct bio_integrity_payload *payload = bio->bi_integrity;
+        struct blk_integrity *profile = bdev_get_integrity(bio->bi_bdev);
+        unsigned int tuples;
+        BUG_ON(payload->bip_buf == NULL);
+        if (!profile->tag_size)
+                return -1;
+        tuples = bio_integrity_hw_sectors(profile,
+                                        DIV_ROUND_UP(len, profile->tag_size));
+        if (tuples * profile->tuple_size > payload->bip_size) {
+                printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+                       __func__, tuples * profile->tuple_size,
+                       payload->bip_size);
+                return -1;
+        }
+        if (!set)
+                profile->get_tag_fn(payload->bip_buf, tag_buf, tuples);
+        else
+                profile->set_tag_fn(payload->bip_buf, tag_buf, tuples);
+        return 0;
+}
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio:        bio to attach buffer to
+ * @tag_buf:    Pointer to a buffer containing tag data
+ * @len:        Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection.  The
+ * size of the integrity buffer must be less than or equal to the size
+ * reported by bio_integrity_tag_size().  The bio must be a WRITE
+ * (BUG_ON otherwise).  Returns the result of bio_integrity_tag():
+ * 0 on success, -1 on failure.
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+        BUG_ON(bio_data_dir(bio) != WRITE);
+        return bio_integrity_tag(bio, tag_buf, len, 1);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio:        bio to retrieve buffer from
+ * @tag_buf:    Pointer to a buffer for the tag data
+ * @len:        Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be less than
+ * or equal to the size reported by bio_integrity_tag_size().  The bio
+ * must be a READ (BUG_ON otherwise).  Returns the result of
+ * bio_integrity_tag(): 0 on success, -1 on failure.
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+        BUG_ON(bio_data_dir(bio) != READ);
+        return bio_integrity_tag(bio, tag_buf, len, 0);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio:        bio to generate integrity metadata for
+ *
+ * Description: Walks the data segments of the bio and calls the block
+ * device's generation callback once per segment, advancing the
+ * protection buffer by tuple_size bytes for every sector_size bytes
+ * of data.  The bio must have a bip attached with enough room to
+ * accommodate the generated integrity metadata (BUG_ON otherwise).
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity_exchg bix;
+        struct bio_vec *bv;
+        sector_t sector = bio->bi_sector;
+        unsigned int i, nr_hw, used = 0;
+        void *prot_buf = bip->bip_buf;
+        bix.sector_size = bi->sector_size;
+        bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+        bio_for_each_segment(bv, bio, i) {
+                void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+                /* Describe this segment to the generation callback */
+                bix.sector = sector;
+                bix.prot_buf = prot_buf;
+                bix.data_size = bv->bv_len;
+                bix.data_buf = kaddr + bv->bv_offset;
+                bi->generate_fn(&bix);
+                /* Step past the sectors and tuples just produced */
+                nr_hw = bv->bv_len / bi->sector_size;
+                sector += nr_hw;
+                prot_buf += nr_hw * bi->tuple_size;
+                used += nr_hw * bi->tuple_size;
+                BUG_ON(used > bip->bip_size);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+}
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio:        bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio.  The bio must have data
+ * direction, target device and start sector set prior to calling.  In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function.  In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ *
+ * Returns 0 on success and -EIO on failure.  If attaching the buffer
+ * pages fails, the payload (which by then owns the buffer) stays
+ * attached to the bio and is released by bio_integrity_free().
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+        struct bio_integrity_payload *bip;
+        struct blk_integrity *bi;
+        struct request_queue *q;
+        void *buf;
+        unsigned long start, end;
+        unsigned int len, nr_pages;
+        unsigned int bytes, offset, i;
+        unsigned int sectors;
+        bi = bdev_get_integrity(bio->bi_bdev);
+        q = bdev_get_queue(bio->bi_bdev);
+        BUG_ON(bi == NULL);
+        BUG_ON(bio_integrity(bio));
+        sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+        /* Allocate kernel buffer for protection data */
+        len = sectors * blk_integrity_tuple_size(bi);
+        buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+        if (unlikely(buf == NULL)) {
+                printk(KERN_ERR "could not allocate integrity buffer\n");
+                return -EIO;
+        }
+        /* Number of pages the (possibly unaligned) buffer touches */
+        end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        start = ((unsigned long) buf) >> PAGE_SHIFT;
+        nr_pages = end - start;
+        /* Allocate bio integrity payload and integrity vectors */
+        bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+        if (unlikely(bip == NULL)) {
+                printk(KERN_ERR "could not allocate data integrity bioset\n");
+                kfree(buf);
+                return -EIO;
+        }
+        bip->bip_buf = buf;
+        bip->bip_size = len;
+        bip->bip_sector = bio->bi_sector;
+        /* Map it */
+        offset = offset_in_page(buf);
+        for (i = 0 ; i < nr_pages ; i++) {
+                int ret;
+                bytes = PAGE_SIZE - offset;
+                if (len == 0)
+                        break;
+                if (bytes > len)
+                        bytes = len;
+                ret = bio_integrity_add_page(bio, virt_to_page(buf),
+                                             bytes, offset);
+                /*
+                 * A bio whose protection buffer is only partially
+                 * mapped must not be reported as ready for I/O.
+                 */
+                if (ret == 0) {
+                        printk(KERN_ERR "could not attach integrity payload\n");
+                        return -EIO;
+                }
+                if (ret < bytes)
+                        break;
+                buf += bytes;
+                len -= bytes;
+                offset = 0;
+        }
+        /* Install custom I/O completion handler if read verify is enabled */
+        if (bio_data_dir(bio) == READ) {
+                bip->bip_end_io = bio->bi_end_io;
+                bio->bi_end_io = bio_integrity_endio;
+        }
+        /* Auto-generate integrity metadata if this is a write */
+        if (bio_data_dir(bio) == WRITE)
+                bio_integrity_generate(bio);
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio:        bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio.  Each data segment of the bio is handed, together with the
+ * matching part of the protection buffer, to the block device's
+ * verify callback.  Returns 0 if every segment passes, otherwise the
+ * first non-zero value returned by the callback; remaining segments
+ * are not checked after a failure.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity_exchg bix;
+        struct bio_vec *bv;
+        sector_t sector = bip->bip_sector;
+        unsigned int i, nr_hw, used = 0, ret = 0;
+        void *prot_buf = bip->bip_buf;
+        bix.sector_size = bi->sector_size;
+        bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+        bio_for_each_segment(bv, bio, i) {
+                void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+                bix.sector = sector;
+                bix.prot_buf = prot_buf;
+                bix.data_size = bv->bv_len;
+                bix.data_buf = kaddr + bv->bv_offset;
+                ret = bi->verify_fn(&bix);
+                if (!ret) {
+                        /* Segment is good; move on to the next tuples */
+                        nr_hw = bv->bv_len / bi->sector_size;
+                        sector += nr_hw;
+                        prot_buf += nr_hw * bi->tuple_size;
+                        used += nr_hw * bi->tuple_size;
+                        BUG_ON(used > bip->bip_size);
+                }
+                kunmap_atomic(kaddr, KM_USER0);
+                if (ret)
+                        break;
+        }
+        return ret;
+}
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work:       Work struct stored in bio to be verified
+ *
+ * Description: This workqueue function is called to complete a READ
+ * request.  It verifies the transferred integrity metadata; on a
+ * verification failure BIO_UPTODATE is cleared and the status becomes
+ * -EIO, otherwise the status saved by bio_integrity_endio() is kept.
+ * The original bio end_io function is then restored and, if set,
+ * called with that status.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+        struct bio_integrity_payload *bip =
+                container_of(work, struct bio_integrity_payload, bip_work);
+        struct bio *bio = bip->bip_bio;
+        int status = bip->bip_error;
+        if (bio_integrity_verify(bio) != 0) {
+                status = -EIO;
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        }
+        /* Hand the bio back to its original completion handler */
+        bio->bi_end_io = bip->bip_end_io;
+        if (!bio->bi_end_io)
+                return;
+        bio->bi_end_io(bio, status);
+}
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio:        Protected bio
+ * @error:      Completion status; saved in the payload and later
+ *              passed on to the original end_io handler
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context.  However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context.  This function postpones completion
+ * accordingly by queueing bio_integrity_verify_fn() on the
+ * kintegrityd workqueue.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        BUG_ON(bip->bip_bio != bio);
+        bip->bip_error = error;
+        INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+        queue_work(kintegrityd_wq, &bip->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip:        Integrity vector to advance
+ * @skip:       Number of bytes to advance it
+ *
+ * Description: Entries that are skipped completely are passed over;
+ * the first entry that is not becomes the new bip_idx, with its
+ * offset and length adjusted for any partially skipped bytes.
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+                             unsigned int skip)
+{
+        struct bio_vec *iv;
+        unsigned int i;
+        bip_for_each_vec(iv, bip, i) {
+                if (skip != 0 && skip >= iv->bv_len) {
+                        /* Entry is consumed entirely, keep looking */
+                        skip -= iv->bv_len;
+                } else {
+                        /* New head; drop the skipped front part, if any */
+                        if (skip != 0) {
+                                iv->bv_len -= skip;
+                                iv->bv_offset += skip;
+                        }
+                        bip->bip_idx = i;
+                        return;
+                }
+        }
+}
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip:        Integrity vector to truncate
+ * @len:        New length of integrity vector
+ *
+ * Description: Consumes @len bytes entry by entry.  An entry longer
+ * than what remains is shortened to fit, and the first entry reached
+ * with nothing remaining becomes the new bip_vcnt.
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+                             unsigned int len)
+{
+        struct bio_vec *iv;
+        unsigned int i;
+        bip_for_each_vec(iv, bip, i) {
+                if (len == 0) {
+                        bip->bip_vcnt = i;
+                        return;
+                }
+                /* Clip the last entry that is only partially wanted */
+                if (len < iv->bv_len)
+                        iv->bv_len = len;
+                len -= iv->bv_len;
+        }
+}
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio:        bio whose integrity vector to update
+ * @bytes_done: number of data bytes that have been completed
+ *
+ * Description: Converts the completed data bytes to hardware sectors,
+ * multiplies by the profile's tuple size and moves the head of the
+ * integrity vector forward by that many bytes.  The bio must have
+ * both a payload and an integrity profile (BUG_ON otherwise).
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        BUG_ON(bip == NULL);
+        BUG_ON(bi == NULL);
+        /* bytes -> 512b sectors -> hardware sectors -> tuple bytes */
+        bio_integrity_mark_head(bip,
+                bio_integrity_hw_sectors(bi, bytes_done >> 9) * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio:        bio whose integrity vector to update
+ * @offset:     offset to first data sector
+ * @sectors:    number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'sectors' data
+ * sectors.  Both counts are converted to hardware sectors before
+ * being scaled by the tuple size, as bio_integrity_advance() does.
+ * The bio must be a clone with a payload and an integrity profile
+ * (BUG_ON otherwise).
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+                        unsigned int sectors)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int skip_sectors, nr_sectors;
+        BUG_ON(bip == NULL);
+        BUG_ON(bi == NULL);
+        BUG_ON(!bio_flagged(bio, BIO_CLONED));
+        /* One tuple protects one hardware sector, not one 512b sector */
+        skip_sectors = bio_integrity_hw_sectors(bi, offset);
+        nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+        bip->bip_sector = bip->bip_sector + offset;
+        bio_integrity_mark_head(bip, skip_sectors * bi->tuple_size);
+        bio_integrity_mark_tail(bip, nr_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio:        Protected bio
+ * @bp:         Resulting bio_pair
+ * @sectors:    Offset
+ *
+ * Description: Splits an integrity page into a bio_pair.  Does
+ * nothing if @bio carries no integrity payload.  Only payloads with
+ * exactly one vector entry are supported (BUG_ON otherwise).
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+        struct blk_integrity *bi;
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        unsigned int nr_sectors;
+        if (bio_integrity(bio) == 0)
+                return;
+        bi = bdev_get_integrity(bio->bi_bdev);
+        BUG_ON(bi == NULL);
+        BUG_ON(bip->bip_vcnt != 1);
+        nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+        /* Both halves use the payloads embedded in the bio_pair */
+        bp->bio1.bi_integrity = &bp->bip1;
+        bp->bio2.bi_integrity = &bp->bip2;
+        /* Start from two copies of the single original vector entry */
+        bp->iv1 = bip->bip_vec[0];
+        bp->iv2 = bip->bip_vec[0];
+        bp->bip1.bip_vec = &bp->iv1;
+        bp->bip2.bip_vec = &bp->iv2;
+        /*
+         * First half keeps the leading tuples, second half the rest.
+         * NOTE(review): the byte counts below use the raw 'sectors'
+         * value while bip_sector uses nr_sectors (hardware sectors);
+         * the two differ for 4096b profiles - confirm which is intended.
+         */
+        bp->iv1.bv_len = sectors * bi->tuple_size;
+        bp->iv2.bv_offset += sectors * bi->tuple_size;
+        bp->iv2.bv_len -= sectors * bi->tuple_size;
+        bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+        bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+        bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+        bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio:        New bio
+ * @bio_src:    Original bio
+ * @bs:         bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio.  The new
+ * payload receives a copy of the source's vector entries, start
+ * sector, entry count and current index.  Returns 0 on success or
+ * -EIO if the payload cannot be allocated.  @bio_src must carry a
+ * payload (BUG_ON otherwise).
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+                        struct bio_set *bs)
+{
+        struct bio_integrity_payload *orig = bio_src->bi_integrity;
+        struct bio_integrity_payload *copy;
+        BUG_ON(orig == NULL);
+        copy = bio_integrity_alloc_bioset(bio, GFP_NOIO, orig->bip_vcnt, bs);
+        if (!copy)
+                return -EIO;
+        copy->bip_idx = orig->bip_idx;
+        copy->bip_vcnt = orig->bip_vcnt;
+        copy->bip_sector = orig->bip_sector;
+        memcpy(copy->bip_vec, orig->bip_vec,
+               orig->bip_vcnt * sizeof(struct bio_vec));
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+/*
+ * Create the integrity payload mempool of @bs with @pool_size
+ * elements backed by bio_integrity_slab.  Returns 0 on success and
+ * -1 if the pool cannot be created.
+ */
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+        mempool_t *pool;
+        pool = mempool_create_slab_pool(pool_size, bio_integrity_slab);
+        bs->bio_integrity_pool = pool;
+        return pool ? 0 : -1;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+/* Destroy the integrity payload mempool of @bs, if one was created. */
+void bioset_integrity_free(struct bio_set *bs)
+{
+        if (!bs->bio_integrity_pool)
+                return;
+        mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+/*
+ * Create the slab cache that backs bio_integrity_payload allocations.
+ * The result is not checked here; SLAB_PANIC is passed to the cache
+ * constructor.
+ * NOTE(review): this function is marked __init yet also exported via
+ * EXPORT_SYMBOL - confirm that the export is really intended.
+ */
+void __init bio_integrity_init_slab(void)
+{
+        bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+                                        SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+EXPORT_SYMBOL(bio_integrity_init_slab);
+/*
+ * Create the "kintegrityd" workqueue on which bio_integrity_endio()
+ * queues verification work.  Panics if the workqueue cannot be
+ * created; registered through subsys_initcall().
+ */
+static int __init integrity_init(void)
+{
+        kintegrityd_wq = create_workqueue("kintegrityd");
+        if (!kintegrityd_wq)
+                panic("Failed to create kintegrityd\n");
+        return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>            /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
 static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
-        int nr_vecs;
-        char *name; 
-        struct kmem_cache *slab;
-};
 /*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 #undef BV
 /*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
-        mempool_t *bio_pool;
-        mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+/*
+ * Return the nr_vecs value recorded for slot @idx of bvec_slabs[].
+ * No range check is done here; @idx must be a valid index into that
+ * BIOVEC_NR_POOLS-sized table.
+ */
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+        const struct biovec_slab *slab = &bvec_slabs[idx];
+        return slab->nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
        struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
                mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
        }
+        if (bio_integrity(bio))
+                bio_integrity_free(bio, bio_set);
        mempool_free(bio, bio_set->bio_pool);
 }
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
         struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
-        if (b) {
+        if (!b)
-                b->bi_destructor = bio_fs_destructor;
+                return NULL;
-                __bio_clone(b, bio);
+        b->bi_destructor = bio_fs_destructor;
+        __bio_clone(b, bio);
+        /* Carry the integrity payload over when the source bio has one. */
+        if (bio_integrity(bio)) {
+                int ret;
+                ret = bio_integrity_clone(b, bio, fs_bio_set);
+                if (ret < 0) {
+                        /* Release the half-built clone instead of leaking it. */
+                        bio_put(b);
+                        return NULL;
+                }
         }
         return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                if (page == prev->bv_page &&
                    offset == prev->bv_offset + prev->bv_len) {
                        prev->bv_len += len;
-                        if (q->merge_bvec_fn &&
-                            q->merge_bvec_fn(q, bio, prev) < len) {
+                        if (q->merge_bvec_fn) {
-                                prev->bv_len -= len;
+                                struct bvec_merge_data bvm = {
-                                return 0;
+                                        .bi_bdev = bio->bi_bdev,
+                                        .bi_sector = bio->bi_sector,
+                                        .bi_size = bio->bi_size,
+                                        .bi_rw = bio->bi_rw,
+                                };
+                                if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+                                        prev->bv_len -= len;
+                                        return 0;
+                                }
                        }
                        goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
         * queue to get further control
         */
        if (q->merge_bvec_fn) {
+                struct bvec_merge_data bvm = {
+                        .bi_bdev = bio->bi_bdev,
+                        .bi_sector = bio->bi_sector,
+                        .bi_size = bio->bi_size,
+                        .bi_rw = bio->bi_rw,
+                };
                /*
                 * merge_bvec_fn() returns number of bytes it can accept
                 * at this offset
                 */
-                if (q->merge_bvec_fn(q, bio, bvec) < len) {
+                if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
                        bvec->bv_page = NULL;
                        bvec->bv_len = 0;
                        bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
        bp->bio1.bi_private = bi;
        bp->bio2.bi_private = pool;
+        if (bio_integrity(bi))
+                bio_integrity_split(bi, bp, first_sectors);
        return bp;
 }
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
+        bioset_integrity_free(bs);
        biovec_free_pools(bs);
        kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
        if (!bs->bio_pool)
                goto bad;
+        if (bioset_integrity_create(bs, bio_pool_size))
+                goto bad;
        if (!biovec_create_pools(bs, bvec_pool_size))
                return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
 {
        bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+        bio_integrity_init_slab();
        biovec_init_slabs();
        fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..d48caee12e2a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg)
        
+/* Run invalidate_bh_lru() on every CPU by way of on_each_cpu(). */
 void invalidate_bh_lrus(void)
 {
-        on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
+        on_each_cpu(invalidate_bh_lru, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
-                } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
+                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
-                if (buffer_mapped(bh) && buffer_dirty(bh)) {
+                if (buffer_mapped(bh) && buffer_dirty(bh) &&
+                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write(bh);
                } else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
                        struct page *page, void *fsdata)
 {
        struct inode *inode = mapping->host;
+        int i_size_changed = 0;
        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
         */
        if (pos+copied > inode->i_size) {
                i_size_write(inode, pos+copied);
-                mark_inode_dirty(inode);
+                i_size_changed = 1;
        }
        unlock_page(page);
        page_cache_release(page);
+        /*
+         * Don't mark the inode dirty under page lock. First, it unnecessarily
+         * makes the holding time of page lock longer. Second, it forces lock
+         * ordering of page lock and transaction start for journaling
+         * filesystems.
+         */
+        if (i_size_changed)
+                mark_inode_dirty(inode);
        return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                        return -ENXIO;
                new = container_of(kobj, struct cdev, kobj);
                spin_lock(&cdev_lock);
+                /* Check i_cdev again in case somebody beat us to it while
+                   we dropped the lock. */
                p = inode->i_cdev;
                if (!p) {
                        inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                cdev_put(p);
                return -ENXIO;
        }
-        if (filp->f_op->open) {
+        if (filp->f_op->open)
-                lock_kernel();
                ret = filp->f_op->open(inode,filp);
-                unlock_kernel();
-        }
        if (ret)
                cdev_put(p);
        return ret;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405ae..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
                if (retval < 0)
                        return (loff_t)retval;
        }
-        return remote_llseek(file, offset, origin);
+        return generic_file_llseek_unlocked(file, offset, origin);
 }
 struct file_system_type cifs_fs_type = {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 97dba0d92348..c54eaab71a19 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -69,9 +69,11 @@
 #include <linux/capi.h>
 #include <linux/gigaset_dev.h>
+#ifdef CONFIG_BLOCK
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/sg.h>
+#endif
 #include <asm/uaccess.h>
 #include <linux/ethtool.h>
@@ -2024,6 +2026,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
 COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
 COMPATIBLE_IOCTL(PIO_FONTRESET)
 COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
+#ifdef CONFIG_BLOCK
 /* Big S */
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
 COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
@@ -2033,6 +2036,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
 COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
 COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
+#endif
 /* Big T */
 COMPATIBLE_IOCTL(TUNSETNOCSUM)
 COMPATIBLE_IOCTL(TUNSETDEBUG)
@@ -2103,6 +2107,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN)
 COMPATIBLE_IOCTL(SIOCSIFVLAN)
 COMPATIBLE_IOCTL(SIOCBRADDBR)
 COMPATIBLE_IOCTL(SIOCBRDELBR)
+#ifdef CONFIG_BLOCK
 /* SG stuff */
 COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
 COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -2127,6 +2132,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET)
 COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
 COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
 COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
+#endif
 /* PPP stuff */
 COMPATIBLE_IOCTL(PPPIOCGFLAGS)
 COMPATIBLE_IOCTL(PPPIOCSFLAGS)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
+#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
        struct dlm_user_proc *proc;
        struct dlm_ls *ls;
+        lock_kernel();
        ls = dlm_find_lockspace_device(iminor(inode));
-        if (!ls)
+        if (!ls) {
+                unlock_kernel();
                return -ENOENT;
+        }
        proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
        if (!proc) {
                dlm_put_lockspace(ls);
+                unlock_kernel();
                return -ENOMEM;
        }
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
        spin_lock_init(&proc->locks_spin);
        init_waitqueue_head(&proc->wait);
        file->private_data = proc;
+        unlock_kernel();
        return 0;
 }
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
+/* Open handler that leaves file->private_data NULL; always returns 0. */
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
+        /*
+         * NOTE(review): presumably stands in for the lock_kernel() that this
+         * patch removes from around ->open in chrdev_open() -- confirm.
+         */
+        cycle_kernel_lock();
         file->private_data = NULL;
         return 0;
 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
        int rc = 0;
        struct file *lower_file = NULL;
+        lock_kernel();
        lower_file = ecryptfs_file_to_lower(file);
        if (lower_file->f_op && lower_file->f_op->fasync)
                rc = lower_file->f_op->fasync(fd, lower_file, flag);
+        unlock_kernel();
        return rc;
 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
                        ext4_group_t block_group)
 {
        ext4_group_t actual_group;
-        ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+        ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
        if (actual_group == block_group)
                return 1;
        return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                                le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
                }
        } else { /* For META_BG_BLOCK_GROUPS */
-                int group_rel = (block_group -
+                bit_max += ext4_bg_num_gdb(sb, block_group);
-                                 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
-                                EXT4_DESC_PER_BLOCK(sb);
-                if (group_rel == 0 || group_rel == 1 ||
-                    (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
-                        bit_max += 1;
        }
        if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
        return 0;
 }
 /**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
 * @sb:                 super block
 * @block_group:        given block group
 *
@@ -305,7 +300,7 @@ err_out:
 * Return buffer_head on success or NULL in case of failure.
 */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
        struct ext4_group_desc * desc;
        struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
                prev = rsv;
        }
        printk("Window map complete.\n");
-        if (bad)
+        BUG_ON(bad);
-                BUG();
 }
 #define rsv_window_dump(root, verbose) \
        __rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
                count -= overflow;
        }
        brelse(bitmap_bh);
-        bitmap_bh = read_block_bitmap(sb, block_group);
+        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
        desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
        spin_unlock(sb_bgl_lock(sbi, block_group));
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks += count;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
 /**
 * ext4_has_free_blocks()
- * @sbi:                in-core super block structure.
+ * @sbi:        in-core super block structure.
+ * @nblocks:    number of needed blocks
 *
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request:
+ * nblocks when the request can be met in full, a smaller count when it can
+ * only be met in part, and 0 when nothing is available to this caller.
 */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                                                ext4_fsblk_t nblocks)
 {
-        ext4_fsblk_t free_blocks, root_blocks;
+        ext4_fsblk_t free_blocks;
+        ext4_fsblk_t root_blocks = 0;
         free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-        root_blocks = ext4_r_blocks_count(sbi->s_es);
-        if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+        /* Only callers without access to the reserve have it held back. */
+        if (!capable(CAP_SYS_RESOURCE) &&
                 sbi->s_resuid != current->fsuid &&
-                (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+                (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-                return 0;
+                root_blocks = ext4_r_blocks_count(sbi->s_es);
-        }
+#ifdef CONFIG_SMP
-        return 1;
+        /*
+         * The value read above came from percpu_counter_read_positive();
+         * re-sum the counter when it suggests we are near or below the
+         * reserve.  The <= test keeps the unsigned subtraction from
+         * wrapping and silently skipping the re-sum.
+         */
+        if (free_blocks <= root_blocks ||
+            free_blocks - root_blocks < FBC_BATCH)
-}
+                free_blocks =
+                        percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+        /*
+         * ext4_fsblk_t arithmetic is unsigned: without this guard a
+         * free_blocks value below root_blocks would wrap to a huge number
+         * and the request would be reported as fully satisfiable.
+         */
+        if (free_blocks <= root_blocks)
+                return 0;
+        if (free_blocks - root_blocks < nblocks)
+                return free_blocks - root_blocks;
+        return nblocks;
+}
 /**
 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
 */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-        if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+        if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
                return 0;
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 /**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
 * @count:              target number of blocks to allocate
 * @errp:               error code
 *
- * ext4_new_blocks uses a goal block to assist allocation.  It tries to
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
- * allocate block(s) from the block group contains the goal block first. If that
+ * the block bitmap directly to do block allocation.  It tries to
- * fails, it will try to allocate block(s) from other block groups without
+ * allocate block(s) from the block group contains the goal block first. If
- * any specific goal block.
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
 *
 */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
        struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
        ext4_group_t ngroups;
        unsigned long num = *count;
-        *errp = -ENOSPC;
        sb = inode->i_sb;
        if (!sb) {
+                *errp = -ENODEV;
                printk("ext4_new_block: nonexistent device");
                return 0;
        }
+        sbi = EXT4_SB(sb);
+        if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+                /*
+                 * With delalloc we already reserved the blocks
+                 */
+                *count = ext4_has_free_blocks(sbi, *count);
+        }
+        if (*count == 0) {
+                *errp = -ENOSPC;
+                return 0;       /*return with ENOSPC error */
+        }
+        num = *count;
        /*
         * Check quota for allocation of this block.
         */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
        if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
                my_rsv = &block_i->rsv_window_node;
-        if (!ext4_has_free_blocks(sbi)) {
-                *errp = -ENOSPC;
-                goto out;
-        }
        /*
         * First, test whether the goal block is free.
         */
@@ -1734,7 +1759,7 @@ retry_alloc:
                my_rsv = NULL;
        if (free_blocks > 0) {
-                bitmap_bh = read_block_bitmap(sb, group_no);
+                bitmap_bh = ext4_read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;
                grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
                        continue;
                brelse(bitmap_bh);
-                bitmap_bh = read_block_bitmap(sb, group_no);
+                bitmap_bh = ext4_read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;
                /*
@@ -1882,7 +1907,15 @@ allocated:
        le16_add_cpu(&gdp->bg_free_blocks_count, -num);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
        spin_unlock(sb_bgl_lock(sbi, group_no));
-        percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+        if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+                percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks -= num;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
        err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
        return 0;
 }
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
+/* do_blk_alloc() flag: the request is for metadata, not file data */
+#define EXT4_META_BLOCK 0x1
-                ext4_fsblk_t goal, int *errp)
+/*
+ * do_blk_alloc() -- common back end for data and metadata block allocation
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @iblock:             logical block, handed to mballoc as ar.logical
+ * @goal:               given target block (filesystem wide)
+ * @count:              in: number of blocks wanted; on the mballoc path it
+ *                      is overwritten with ar.len after the allocation call
+ * @errp:               error code
+ * @flags:              EXT4_META_BLOCK marks a metadata request
+ *
+ * Without the MBALLOC mount option the request goes to
+ * ext4_old_new_blocks(), which is passed neither @iblock nor @flags.
+ */
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+                                ext4_lblk_t iblock, ext4_fsblk_t goal,
+                                unsigned long *count, int *errp, int flags)
 {
         struct ext4_allocation_request ar;
         ext4_fsblk_t ret;
         if (!test_opt(inode->i_sb, MBALLOC)) {
-                unsigned long count = 1;
+                return ext4_old_new_blocks(handle, inode, goal, count, errp);
-                ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-                return ret;
         }
         memset(&ar, 0, sizeof(ar));
+        /* Describe the request to the multi-block allocator */
         ar.inode = inode;
         ar.goal = goal;
-        ar.len = 1;
+        ar.len = *count;
+        ar.logical = iblock;
+        if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+                /* enable in-core preallocation for data block allocation */
+                ar.flags = EXT4_MB_HINT_DATA;
+        else
+                /* disable in-core preallocation for metadata and non-regular files */
+                ar.flags = 0;
         ret = ext4_mb_new_blocks(handle, &ar, errp);
+        *count = ar.len;
         return ret;
 }
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate blocks for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block (filesystem wide)
+ * @count:              total number of blocks needed
+ * @errp:               error code
+ *
+ * Return the 1st allocated block number on success; *count stores the total
+ * count of blocks, and any error is stored through the errp pointer.
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                 ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-        struct ext4_allocation_request ar;
         ext4_fsblk_t ret;
+        ret = do_blk_alloc(handle, inode, 0, goal,
-        if (!test_opt(inode->i_sb, MBALLOC)) {
+                                count, errp, EXT4_META_BLOCK);
-                ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
+        /*
-                return ret;
+         * Account for the allocated meta blocks.
+         * NOTE(review): this relies on *errp being 0 after a successful
+         * allocation -- confirm both allocator paths guarantee that.
+         */
+        if (!(*errp)) {
+                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+                EXT4_I(inode)->i_allocated_meta_blocks += *count;
+                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
         }
-        memset(&ar, 0, sizeof(ar));
-        ar.inode = inode;
-        ar.goal = goal;
-        ar.len = *count;
-        ret = ext4_mb_new_blocks(handle, &ar, errp);
-        *count = ar.len;
         return ret;
 }
+/*
+ * ext4_new_meta_block() -- allocate one meta data (indexing) block
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block (filesystem wide)
+ * @errp:               error code
+ *
+ * Single-block wrapper: forwards to ext4_new_meta_blocks() with a count
+ * of one and returns its result.
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+                ext4_fsblk_t goal, int *errp)
+{
+        unsigned long one_block = 1;
+        ext4_fsblk_t first;
+        first = ext4_new_meta_blocks(handle, inode, goal, &one_block, errp);
+        return first;
+}
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @iblock:             logical block the allocation is for
+ * @goal:               given target block (filesystem wide)
+ * @count:              total number of blocks needed
+ * @errp:               error code
+ *
+ * Forwards to do_blk_alloc() with no flags set and returns its result;
+ * *count and *errp are updated by that call.
+ */
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+                                ext4_lblk_t iblock, ext4_fsblk_t goal,
+                                unsigned long *count, int *errp)
+{
+        ext4_fsblk_t first;
+        first = do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+        return first;
+}
 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                        continue;
                desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
                brelse(bitmap_bh);
-                bitmap_bh = read_block_bitmap(sb, i);
+                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (bitmap_bh == NULL)
                        continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
                struct buffer_head *bh = NULL;
                map_bh.b_state = 0;
-                err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+                err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+                                                0, 0, 0);
                if (err > 0) {
                        pgoff_t index = map_bh.b_blocknr >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
        while (n) {
                /* Do the node's children first */
-                if ((n)->rb_left) {
+                if (n->rb_left) {
                        n = n->rb_left;
                        continue;
                }
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
                        parent->rb_right = NULL;
                n = parent;
        }
-        root->rb_node = NULL;
 }
-static struct dir_private_info *create_dir_info(loff_t pos)
+/*
+ * Allocate the htree readdir state that ext4_dx_readdir() stores in
+ * filp->private_data, seeded from file position @pos.
+ * Returns NULL if the allocation fails.
+ */
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
 {
         struct dir_private_info *p;
-        p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+        /* kzalloc replaces kmalloc plus the explicit NULL/0 field resets */
+        p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
         if (!p)
                 return NULL;
-        p->root.rb_node = NULL;
-        p->curr_node = NULL;
-        p->extra_fname = NULL;
-        p->last_pos = 0;
         p->curr_hash = pos2maj_hash(pos);
         p->curr_minor_hash = pos2min_hash(pos);
-        p->next_hash = 0;
         return p;
 }
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
        int     ret;
        if (!info) {
-                info = create_dir_info(filp->f_pos);
+                info = ext4_htree_create_dir_info(filp->f_pos);
                if (!info)
                        return -ENOMEM;
                filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
 #include "ext4_i.h"
 /*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
 */
 /*
@@ -45,7 +45,7 @@
 #define ext4_debug(f, a...)                                             \
        do {                                                            \
                printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",       \
-                        __FILE__, __LINE__, __FUNCTION__);              \
+                        __FILE__, __LINE__, __func__);                  \
                printk (KERN_DEBUG f, ## a);                            \
        } while (0)
 #else
@@ -74,6 +74,9 @@
 #define EXT4_MB_HINT_GOAL_ONLY          256
 /* goal is meaningful */
 #define EXT4_MB_HINT_TRY_GOAL           512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED      1024
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
        __u32   bg_reserved2[3];
 };
+/*
+ * Per flex block group counters, kept in sbi->s_flex_groups[] and indexed
+ * by the value ext4_flex_group() returns.
+ */
+struct flex_groups {
+        /* presumably the free inode count of the flex group -- not touched in balloc.c; confirm */
+        __u32 free_inodes;
+        /* raised when blocks are freed and lowered when they are allocated, under sb_bgl_lock */
+        __u32 free_blocks;
+};
 #define EXT4_BG_INODE_UNINIT    0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT    0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED    0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC              0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
        __le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-        __u32   s_reserved[163];        /* Padding to the end of the block */
+        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
+        __u8    s_reserved_char_pad2;
+        __le16  s_reserved_pad;
+        __u32   s_reserved[162];        /* Padding to the end of the block */
 };
 #ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+                                        ext4_lblk_t iblock, ext4_fsblk_t goal,
+                                        unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                                                ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
                        ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
                unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+                ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+                ext4_grpblk_t add);
 /* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode (struct inode *, int);
 extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                                struct kstat *stat);
 extern void ext4_delete_inode (struct inode *);
 extern int  ext4_sync_inode (handle_t *, struct inode *);
 extern void ext4_discard_reservation (struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 }
+/*
+ * Map a block group number to the index of the flex group that holds it
+ * by dropping the low s_log_groups_per_flex bits.
+ */
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+                                             ext4_group_t block_group)
+{
+        unsigned int shift = sbi->s_log_groups_per_flex;
+        return block_group >> shift;
+}
+/*
+ * Number of block groups in one flex group: 2^s_log_groups_per_flex.
+ * The shift is done on an unsigned constant so that a log value of 31
+ * does not shift into the sign bit of an int (undefined behaviour).
+ */
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+        return 1U << sbi->s_log_groups_per_flex;
+}
 #define ext4_std_error(sb, errno)                               \
 do {                                                            \
        if ((errno))                                            \
-                __ext4_std_error((sb), __FUNCTION__, (errno));  \
+                __ext4_std_error((sb), __func__, (errno));      \
 } while (0)
 /*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
                        unsigned long max_blocks, struct buffer_head *bh_result,
                        int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
                        sector_t block, unsigned long max_blocks,
                        struct buffer_head *bh, int create,
-                        int extend_disksize);
+                        int extend_disksize, int flag);
 #endif  /* __KERNEL__ */
 #endif  /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
                (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
 };
 /*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
 */
 struct ext4_inode_info {
        __le32  i_data[15];     /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
         */
        struct rw_semaphore i_data_sem;
        struct inode vfs_inode;
+        struct jbd2_inode jinode;
        unsigned long i_ext_generation;
        struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
        /* mballoc */
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
+        /* allocation reservation info for delalloc */
+        unsigned long i_reserved_data_blocks;
+        unsigned long i_reserved_meta_blocks;
+        unsigned long i_allocated_meta_blocks;
+        unsigned short i_delalloc_reserved_flag;
+        spinlock_t i_block_reservation_lock;
 };
 #endif  /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
                                handle_t *handle, struct buffer_head *bh);
 #define ext4_journal_get_undo_access(handle, bh) \
-        __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+        __ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-        __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+        __ext4_journal_get_write_access(__func__, (handle), (bh))
 #define ext4_journal_revoke(handle, blocknr, bh) \
-        __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+        __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
-        __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+        __ext4_journal_get_create_access(__func__, (handle), (bh))
 #define ext4_journal_dirty_metadata(handle, bh) \
-        __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+        __ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
-        __ext4_journal_forget(__FUNCTION__, (handle), (bh))
+        __ext4_journal_forget(__func__, (handle), (bh))
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 #define ext4_journal_stop(handle) \
-        __ext4_journal_stop(__FUNCTION__, (handle))
+        __ext4_journal_stop(__func__, (handle))
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
        return jbd2_journal_force_commit(journal);
 }
+/*
+ * Hand the jbd2_inode embedded in @inode's ext4_inode_info to
+ * jbd2_journal_file_inode() for @handle and return its result.
+ */
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+        struct jbd2_inode *jinode = &EXT4_I(inode)->jinode;
+        return jbd2_journal_file_inode(handle, jinode);
+}
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
 #include <linux/rbtree.h>
 /*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
 */
 struct ext4_sb_info {
        unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
        /* locality groups */
        struct ext4_locality_group *s_locality_groups;
+        unsigned int s_log_groups_per_flex;
+        struct flex_groups *s_flex_groups;
 };
 #endif  /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+/*
+ * Make sure @handle has more than @needed buffer credits.  First try to
+ * extend the running handle; only when that is not possible restart the
+ * handle.  Returns 0 on success or a negative error code.
+ */
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
        int err;
        if (handle->h_buffer_credits > needed)
-                return handle;
+                return 0;
-        if (!ext4_journal_extend(handle, needed))
+        err = ext4_journal_extend(handle, needed);
-                return handle;
+        /* 0: handle was extended, done; < 0: real error, report it */
+        if (err <= 0)
-        err = ext4_journal_restart(handle, needed);
+                return err;
+        /* > 0: transaction too full to extend in place, so restart */
+        return ext4_journal_restart(handle, needed);
-        return handle;
 }
 /*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        return bg_start + colour + block;
 }
+/*
+ * Allocation for a meta data block
+ */
 static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path,
                        struct ext4_extent *ex, int *err)
 {
        ext4_fsblk_t goal, newblock;
        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-        newblock = ext4_new_block(handle, inode, goal, err);
+        newblock = ext4_new_meta_block(handle, inode, goal, err);
        return newblock;
 }
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
        return size;
 }
+/*
+ * Estimate how many extent-tree metadata blocks are needed to map
+ * @blocks new data blocks.
+ * The worst case is assumed: every block ends up in an extent of its own.
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+        int root_cap = ext4_ext_space_root_idx(inode);
+        int leaf_cap = ext4_ext_space_block(inode);
+        int idx_cap = ext4_ext_space_block_idx(inode);
+        int nr_leaves, nr_idx, total;
+        /* leaf blocks required to hold one extent per data block */
+        nr_leaves = (blocks + leaf_cap - 1) / leaf_cap;
+        total = nr_leaves;
+        /*
+         * Worst case, separate index block(s) are needed to link all
+         * the new leaf blocks; keep adding index levels for as long as
+         * the remaining index count exceeds the root capacity.
+         */
+        nr_idx = (nr_leaves + idx_cap - 1) / idx_cap;
+        for (;;) {
+                total += nr_idx;
+                nr_idx = (nr_idx + idx_cap - 1) / idx_cap;
+                if (nr_idx <= root_cap)
+                        break;
+        }
+        return total;
+}
 static int
 ext4_ext_max_entries(struct inode *inode, int depth)
 {
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                alloc = 1;
        }
        path[0].p_hdr = eh;
+        path[0].p_bh = NULL;
        i = depth;
        /* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        }
        path[ppos].p_depth = i;
-        path[ppos].p_hdr = eh;
        path[ppos].p_ext = NULL;
        path[ppos].p_idx = NULL;
        /* find extent */
        ext4_ext_binsearch(inode, path + ppos, block);
+        /* if not an empty leaf */
+        if (path[ppos].p_ext)
+                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
        ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        /* allocate all needed blocks */
        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
        for (a = 0; a < depth - at; a++) {
-                newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+                newblock = ext4_ext_new_meta_block(handle, inode, path,
+                                                   newext, &err);
                if (newblock == 0)
                        goto cleanup;
                ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext4_fsblk_t newblock;
        int err = 0;
-        newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+        newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
        if (newblock == 0)
                return err;
@@ -981,6 +1017,8 @@ repeat:
                /* if we found index with free entry, then use that
                 * entry: create all needed subtree and add new leaf */
                err = ext4_ext_split(handle, inode, path, newext, i);
+                if (err)
+                        goto out;
                /* refill path */
                ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
-                handle = ext4_ext_journal_restart(handle, credits);
+                err = ext4_ext_journal_restart(handle, credits);
-                if (IS_ERR(handle)) {
+                if (err)
-                        err = PTR_ERR(handle);
                        goto out;
-                }
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        int err = 0, depth, ret;
        unsigned long allocated = 0;
        struct ext4_allocation_request ar;
+        loff_t disksize;
        __clear_bit(BH_New, &bh_result->b_state);
        ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                 */
                                if (allocated > max_blocks)
                                        allocated = max_blocks;
-                                /* mark the buffer unwritten */
+                                set_buffer_unwritten(bh_result);
-                                __set_bit(BH_Unwritten, &bh_result->b_state);
                                goto out2;
                        }
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                goto out2;
        }
-        if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
-                EXT4_I(inode)->i_disksize = inode->i_size;
        /* previous routine could use block we allocated */
        newblock = ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-        __set_bit(BH_New, &bh_result->b_state);
+        if (extend_disksize) {
+                disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > EXT4_I(inode)->i_disksize)
+                        EXT4_I(inode)->i_disksize = disksize;
+        }
+        set_buffer_new(bh_result);
        /* Cache only when it is _not_ an uninitialized extent */
        if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
        if (allocated > max_blocks)
                allocated = max_blocks;
        ext4_ext_show_leaf(inode, path);
-        __set_bit(BH_Mapped, &bh_result->b_state);
+        set_buffer_mapped(bh_result);
        bh_result->b_bdev = inode->i_sb->s_bdev;
        bh_result->b_blocknr = newblock;
 out2:
@@ -2744,7 +2785,7 @@ out2:
        return err ? err : allocated;
 }
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
 {
        struct address_space *mapping = inode->i_mapping;
        struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
         */
        err = ext4_writepage_trans_blocks(inode) + 3;
        handle = ext4_journal_start(inode, err);
-        if (IS_ERR(handle)) {
+        if (IS_ERR(handle))
-                if (page) {
-                        clear_highpage(page);
-                        flush_dcache_page(page);
-                        unlock_page(page);
-                        page_cache_release(page);
-                }
                return;
-        }
-        if (page)
+        if (inode->i_size & (sb->s_blocksize - 1))
-                ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+                ext4_block_truncate_page(handle, mapping, inode->i_size);
+        if (ext4_orphan_add(handle, inode))
+                goto out_stop;
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
         * Probably we need not scan at all,
         * because page truncation is enough.
         */
-        if (ext4_orphan_add(handle, inode))
-                goto out_stop;
        /* we have to know where to truncate from in crash case */
        EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
                handle->h_sync = 1;
 out_stop:
+        up_write(&EXT4_I(inode)->i_data_sem);
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);
-        up_write(&EXT4_I(inode)->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
                }
                ret = ext4_get_blocks_wrap(handle, inode, block,
                                          max_blocks, &map_bh,
-                                          EXT4_CREATE_UNINITIALIZED_EXT, 0);
+                                          EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
        return ret;
 }
+/* vm operations for mmap'ed ext4 files; installed by ext4_file_mmap() */
+static struct vm_operations_struct ext4_file_vm_ops = {
+        .fault          = filemap_fault,
+        .page_mkwrite   = ext4_page_mkwrite,
+};
+/*
+ * mmap handler for regular ext4 files.  Refuses the mapping with -ENOEXEC
+ * when the address space has no ->readpage; otherwise marks the file
+ * accessed, installs ext4_file_vm_ops on the vma and sets VM_CAN_NONLINEAR.
+ */
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        const struct address_space_operations *aops = file->f_mapping->a_ops;
+        if (aops->readpage == NULL)
+                return -ENOEXEC;
+        file_accessed(file);
+        vma->vm_flags |= VM_CAN_NONLINEAR;
+        vma->vm_ops = &ext4_file_vm_ops;
+        return 0;
+}
 const struct file_operations ext4_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
 #endif
-        .mmap           = generic_file_mmap,
+        .mmap           = ext4_file_mmap,
        .open           = generic_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
 const struct inode_operations ext4_file_inode_operations = {
        .truncate       = ext4_truncate,
        .setattr        = ext4_setattr,
+        .getattr        = ext4_getattr,
 #ifdef CONFIG_EXT4DEV_FS_XATTR
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
+#include <linux/blkdev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;
+        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        int ret = 0;
        J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
                        .nr_to_write = 0, /* sys_fsync did this */
                };
                ret = sync_inode(inode, &wbc);
+                if (journal && (journal->j_flags & JBD2_BARRIER))
+                        blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
        }
 out:
        return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
                                   struct ext4_group_desc *gdp);
 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
                                       struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                      ext4_group_t block_group);
 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
                                       struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
        struct ext4_super_block * es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err;
+        ext4_group_t flex_group;
        if (atomic_read(&inode->i_count) > 1) {
                printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
                        if (is_directory)
                                percpu_counter_dec(&sbi->s_dirs_counter);
+                        if (sbi->s_log_groups_per_flex) {
+                                flex_group = ext4_flex_group(sbi, block_group);
+                                spin_lock(sb_bgl_lock(sbi, flex_group));
+                                sbi->s_flex_groups[flex_group].free_inodes++;
+                                spin_unlock(sb_bgl_lock(sbi, flex_group));
+                        }
                }
                BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
                err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
        return ret;
 }
+#define free_block_ratio 10
+/*
+ * find_group_flex - pick a block group for a new inode using flex groups
+ * @sb:         super block of the filesystem
+ * @parent:     directory the new inode is created in
+ * @best_group: set to the chosen block group on success
+ *
+ * Prefer the parent's flex group, then the one just before it, if it has
+ * free inodes and more than free_block_ratio percent of its blocks free.
+ * Otherwise scan all flex groups, taking the first one that passes the
+ * same test and remembering the one with the most free blocks as fallback.
+ * Inside the chosen flex group the first block group that still has free
+ * inodes is returned.
+ *
+ * Returns 0 with @best_group set, or -1 if no suitable group was found.
+ */
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+                           ext4_group_t *best_group)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *desc;
+        struct buffer_head *bh;
+        struct flex_groups *flex_group = sbi->s_flex_groups;
+        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+        ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+        ext4_group_t ngroups = sbi->s_groups_count;
+        int flex_size = ext4_flex_bg_size(sbi);
+        ext4_group_t best_flex = parent_fbg_group;
+        int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+        int flexbg_free_blocks;
+        int flex_freeb_ratio;
+        ext4_group_t n_fbg_groups;
+        ext4_group_t i;
+        /* number of flex groups, counting a trailing partial one */
+        n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+                sbi->s_log_groups_per_flex;
+find_close_to_parent:
+        flexbg_free_blocks = flex_group[best_flex].free_blocks;
+        flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+        if (flex_group[best_flex].free_inodes &&
+            flex_freeb_ratio > free_block_ratio)
+                goto found_flexbg;
+        /* parent's flex group is too full: try its predecessor once */
+        if (best_flex && best_flex == parent_fbg_group) {
+                best_flex--;
+                goto find_close_to_parent;
+        }
+        for (i = 0; i < n_fbg_groups; i++) {
+                /* these two were already examined above */
+                if (i == parent_fbg_group || i == parent_fbg_group - 1)
+                        continue;
+                flexbg_free_blocks = flex_group[i].free_blocks;
+                flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+                if (flex_freeb_ratio > free_block_ratio &&
+                    flex_group[i].free_inodes) {
+                        best_flex = i;
+                        goto found_flexbg;
+                }
+                /*
+                 * NOTE(review): best_flex is an ext4_group_t; if that type
+                 * is unsigned the "< 0" test can never be true - confirm.
+                 */
+                if (best_flex < 0 ||
+                    (flex_group[i].free_blocks >
+                     flex_group[best_flex].free_blocks &&
+                     flex_group[i].free_inodes))
+                        best_flex = i;
+        }
+        if (!flex_group[best_flex].free_inodes ||
+            !flex_group[best_flex].free_blocks)
+                return -1;
+found_flexbg:
+        for (i = best_flex * flex_size; i < ngroups &&
+                     i < (best_flex + 1) * flex_size; i++) {
+                desc = ext4_get_group_desc(sb, i, &bh);
+                /* no descriptor available for this group: do not touch it */
+                if (!desc)
+                        continue;
+                if (le16_to_cpu(desc->bg_free_inodes_count)) {
+                        *best_group = i;
+                        goto out;
+                }
+        }
+        return -1;
+out:
+        return 0;
+}
 /*
 * Orlov's allocator for directories.
 *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        struct inode *ret;
        ext4_group_t i;
        int free = 0;
+        ext4_group_t flex_group;
        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
+        if (sbi->s_log_groups_per_flex) {
+                ret2 = find_group_flex(sb, dir, &group);
+                goto got_group;
+        }
        if (S_ISDIR(mode)) {
                if (test_opt (sb, OLDALLOC))
                        ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        } else
                ret2 = find_group_other(sb, dir, &group);
+got_group:
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;
@@ -600,7 +689,7 @@ got:
        /* We may have to initialize the block bitmap if it isn't already */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                struct buffer_head *block_bh = read_block_bitmap(sb, group);
+                struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
                BUFFER_TRACE(block_bh, "get block bitmap access");
                err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
                percpu_counter_inc(&sbi->s_dirs_counter);
        sb->s_dirt = 1;
+        if (sbi->s_log_groups_per_flex) {
+                flex_group = ext4_flex_group(sbi, group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_inodes--;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        inode->i_uid = current->fsuid;
        if (test_opt (sb, GRPID))
                inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
                goto fail_free_drop;
        if (test_opt(sb, EXTENTS)) {
-                /* set extent flag only for diretory, file and normal symlink*/
+                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
                        ext4_ext_tree_init(handle, inode);
-                        err = ext4_update_incompat_feature(handle, sb,
-                                        EXT4_FEATURE_INCOMPAT_EXTENTS);
-                        if (err)
-                                goto fail_free_drop;
                }
        }
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        if (IS_ERR(inode))
                goto iget_failed;
+        /*
+         * If the orphan has i_nlink > 0 then it should be able to be
+         * truncated, otherwise it won't be removed from the orphan list
+         * during processing and an infinite loop will result.
+         */
+        if (inode->i_nlink && !ext4_can_truncate(inode))
+                goto bad_orphan;
        if (NEXT_ORPHAN(inode) > max_ino)
                goto bad_orphan;
        brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
                printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(inode));
                printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+                printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (inode->i_nlink == 0)
                        inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "ext4_extents.h"
+/*
+ * Call jbd2_journal_begin_ordered_truncate() on the jbd2_inode embedded
+ * in @inode's ext4_inode_info, passing @new_size, and return its result.
+ */
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+                                              loff_t new_size)
+{
+        struct jbd2_inode *jinode = &EXT4_I(inode)->jinode;
+        return jbd2_journal_begin_ordered_truncate(jinode, new_size);
+}
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
 /*
 * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
 {
        handle_t *handle;
+        if (ext4_should_order_data(inode))
+                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 *              direct blocks
 */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t goal, int indirect_blks, int blks,
+                                ext4_lblk_t iblock, ext4_fsblk_t goal,
-                        ext4_fsblk_t new_blocks[4], int *err)
+                                int indirect_blks, int blks,
+                                ext4_fsblk_t new_blocks[4], int *err)
 {
        int target, i;
-        unsigned long count = 0;
+        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
        ext4_fsblk_t current_block = 0;
        int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
         * the first direct block of this branch.  That's the
         * minimum number of blocks need to allocate(required)
         */
-        target = blks + indirect_blks;
+        /* first we try to allocate the indirect blocks */
+        target = indirect_blks;
-        while (1) {
+        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
-                current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+                current_block = ext4_new_meta_blocks(handle, inode,
+                                                        goal, &count, err);
                if (*err)
                        goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                        new_blocks[index++] = current_block++;
                        count--;
                }
+                if (count > 0) {
-                if (count > 0)
+                        /*
+                         * save the new block number
+                         * for the first direct block
+                         */
+                        new_blocks[index] = current_block;
+                        printk(KERN_INFO "%s returned more blocks than "
+                                                "requested\n", __func__);
+                        WARN_ON(1);
                        break;
+                }
        }
-        /* save the new block number for the first direct block */
+        target = blks - count ;
-        new_blocks[index] = current_block;
+        blk_allocated = count;
+        if (!target)
+                goto allocated;
+        /* Now allocate data blocks */
+        count = target;
+        /* allocating blocks for data blocks */
+        current_block = ext4_new_blocks(handle, inode, iblock,
+                                                goal, &count, err);
+        if (*err && (target == blks)) {
+                /*
+                 * if the allocation failed and we didn't allocate
+                 * any blocks before
+                 */
+                goto failed_out;
+        }
+        if (!*err) {
+                if (target == blks) {
+                /*
+                 * save the new block number
+                 * for the first direct block
+                 */
+                        new_blocks[index] = current_block;
+                }
+                blk_allocated += count;
+        }
+allocated:
        /* total number of blocks allocated for direct blocks */
-        ret = count;
+        ret = blk_allocated;
        *err = 0;
        return ret;
 failed_out:
@@ -584,8 +631,9 @@ failed_out:
 *      as described above and return 0.
 */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-                        int indirect_blks, int *blks, ext4_fsblk_t goal,
+                                ext4_lblk_t iblock, int indirect_blks,
-                        ext4_lblk_t *offsets, Indirect *branch)
+                                int *blks, ext4_fsblk_t goal,
+                                ext4_lblk_t *offsets, Indirect *branch)
 {
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
        ext4_fsblk_t new_blocks[4];
        ext4_fsblk_t current_block;
-        num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+        num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
                                *blks, new_blocks, &err);
        if (err)
                return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        struct ext4_inode_info *ei = EXT4_I(inode);
        int count = 0;
        ext4_fsblk_t first_block = 0;
+        loff_t disksize;
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        /*
         * Block out ext4_truncate while we alter the tree
         */
-        err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
+        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
-                                offsets + (partial - chain), partial);
+                                        &count, goal,
+                                        offsets + (partial - chain), partial);
        /*
         * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
         * protect it if you're about to implement concurrent
         * ext4_get_block() -bzzz
        */
-        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
+        if (!err && extend_disksize) {
-                ei->i_disksize = inode->i_size;
+                disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > ei->i_disksize)
+                        ei->i_disksize = disksize;
+        }
        if (err)
                goto cleanup;
@@ -934,7 +989,7 @@ out:
 */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                        unsigned long max_blocks, struct buffer_head *bh,
-                        int create, int extend_disksize)
+                        int create, int extend_disksize, int flag)
 {
        int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
         * with create == 1 flag.
         */
        down_write((&EXT4_I(inode)->i_data_sem));
+        /*
+         * if the caller is from delayed allocation writeout path
+         * we have already reserved fs blocks for allocation
+         * let the underlying get_block() function know to
+         * avoid double accounting
+         */
+        if (flag)
+                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                                                        ~EXT4_EXT_MIGRATE;
                }
        }
+        if (flag) {
+                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                /*
+                 * Update reserved blocks/metadata blocks
+                 * after successful block allocation
+                 * which were deferred till now
+                 */
+                if ((retval > 0) && buffer_delay(bh))
+                        ext4_da_release_space(inode, retval, 0);
+        }
        up_write((&EXT4_I(inode)->i_data_sem));
        return retval;
 }
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
        }
        ret = ext4_get_blocks_wrap(handle, inode, iblock,
-                                        max_blocks, bh_result, create, 0);
+                                        max_blocks, bh_result, create, 0, 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        err = ext4_get_blocks_wrap(handle, inode, block, 1,
-                                        &dummy, create, 1);
+                                        &dummy, create, 1, 0);
        /*
         * ext4_get_blocks_handle() returns number of blocks
         * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        to = from + len;
 retry:
-        page = __grab_cache_page(mapping, index);
-        if (!page)
-                return -ENOMEM;
-        *pagep = page;
        handle = ext4_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
-                unlock_page(page);
-                page_cache_release(page);
                ret = PTR_ERR(handle);
                goto out;
        }
+        page = __grab_cache_page(mapping, index);
+        if (!page) {
+                ext4_journal_stop(handle);
+                ret = -ENOMEM;
+                goto out;
+        }
+        *pagep = page;
        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                                        ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
        }
        if (ret) {
-                ext4_journal_stop(handle);
                unlock_page(page);
+                ext4_journal_stop(handle);
                page_cache_release(page);
        }
@@ -1236,15 +1313,6 @@ out:
        return ret;
 }
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-        int err = jbd2_journal_dirty_data(handle, bh);
-        if (err)
-                ext4_journal_abort_handle(__func__, __func__,
-                                                bh, handle, err);
-        return err;
-}
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 /*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
-                                struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
-{
-        struct inode *inode = file->f_mapping->host;
-        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (pos+copied > inode->i_size) {
-                i_size_write(inode, pos+copied);
-                mark_inode_dirty(inode);
-        }
-        return copied;
-}
-/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
                                struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-        struct inode *inode = file->f_mapping->host;
+        struct inode *inode = mapping->host;
        unsigned from, to;
        int ret = 0, ret2;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
-        ret = walk_page_buffers(handle, page_buffers(page),
+        ret = ext4_jbd2_file_inode(handle, inode);
-                from, to, NULL, ext4_journal_dirty_data);
        if (ret == 0) {
                /*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
                new_i_size = pos + copied;
                if (new_i_size > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = new_i_size;
-                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+                ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
                if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        unlock_page(page);
-        page_cache_release(page);
        return ret ? ret : copied;
 }
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
                                struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-        struct inode *inode = file->f_mapping->host;
+        struct inode *inode = mapping->host;
        int ret = 0, ret2;
        loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
        if (new_i_size > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = new_i_size;
-        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+        ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
        if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        unlock_page(page);
-        page_cache_release(page);
        return ret ? ret : copied;
 }
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
                        ret = ret2;
        }
+        unlock_page(page);
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        unlock_page(page);
        page_cache_release(page);
        return ret ? ret : copied;
 }
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * to allocate @blocks for a non-extent-based (indirect-mapped) file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+        int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+        int ind_blks, dind_blks, tind_blks;
+        /* number of new indirect blocks needed */
+        ind_blks = (blocks + icap - 1) / icap;
+        dind_blks = (ind_blks + icap - 1) / icap;
+        tind_blks = 1;
+        return ind_blks + dind_blks + tind_blks;
+}
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * to allocate the given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+                return ext4_ext_calc_metadata_amount(inode, blocks);
+        return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       unsigned long md_needed, mdblocks, total = 0;
+        /*
+         * recalculate the amount of metadata blocks to reserve
+         * in order to allocate nrblocks
+         * worst case is one extent per block
+         */
+        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+        mdblocks = ext4_calc_metadata_amount(inode, total);
+        BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+        total = md_needed + nrblocks;
+        if (ext4_has_free_blocks(sbi, total) < total) {
+                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                return -ENOSPC;
+        }
+        /* reduce fs free blocks counter */
+        percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+        EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        return 0;       /* success */
+}
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        int total, mdb, mdb_free, release;
+        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        /* recalculate the number of metablocks still need to be reserved */
+        total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+        mdb = ext4_calc_metadata_amount(inode, total);
+        /* figure out how many metablocks to release */
+        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+        /* Account for allocated meta_blocks */
+        mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+        release = to_free + mdb_free;
+        /* update fs free blocks counter for truncate case */
+        percpu_counter_add(&sbi->s_freeblocks_counter, release);
+        /* update per-inode reservations */
+        BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+        EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+        EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+        EXT4_I(inode)->i_allocated_meta_blocks = 0;
+        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+static void ext4_da_page_release_reservation(struct page *page,
+                                                unsigned long offset)
+{
+        int to_release = 0;
+        struct buffer_head *head, *bh;
+        unsigned int curr_off = 0;
+        head = page_buffers(page);
+        bh = head;
+        do {
+                unsigned int next_off = curr_off + bh->b_size;
+                if ((offset <= curr_off) && (buffer_delay(bh))) {
+                        to_release++;
+                        clear_buffer_delay(bh);
+                }
+                curr_off = next_off;
+        } while ((bh = bh->b_this_page) != head);
+        ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+/*
+ * Delayed allocation stuff
+ */
+struct mpage_da_data {
+        struct inode *inode;
+        struct buffer_head lbh;                 /* extent of blocks */
+        unsigned long first_page, next_page;    /* extent of pages */
+        get_block_t *get_block;
+        struct writeback_control *wbc;
+};
+/*
+ * mpage_da_submit_io - walks through an extent of pages and tries to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+        struct address_space *mapping = mpd->inode->i_mapping;
+        struct mpage_data mpd_pp = {
+                .bio = NULL,
+                .last_block_in_bio = 0,
+                .get_block = mpd->get_block,
+                .use_writepage = 1,
+        };
+        int ret = 0, err, nr_pages, i;
+        unsigned long index, end;
+        struct pagevec pvec;
+        BUG_ON(mpd->next_page <= mpd->first_page);
+        pagevec_init(&pvec, 0);
+        index = mpd->first_page;
+        end = mpd->next_page - 1;
+        while (index <= end) {
+                /* XXX: optimize tail */
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        index = page->index;
+                        if (index > end)
+                                break;
+                        index++;
+                        err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+                        /*
+                         * In error case, we have to continue because
+                         * remaining pages are still locked
+                         * XXX: unlock and re-dirty them?
+                         */
+                        if (ret == 0)
+                                ret = err;
+                }
+                pagevec_release(&pvec);
+        }
+        if (mpd_pp.bio)
+                mpage_bio_submit(WRITE, mpd_pp.bio);
+        return ret;
+}
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * The function goes through all the passed space and puts actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+                                 struct buffer_head *exbh)
+{
+        struct inode *inode = mpd->inode;
+        struct address_space *mapping = inode->i_mapping;
+        int blocks = exbh->b_size >> inode->i_blkbits;
+        sector_t pblock = exbh->b_blocknr, cur_logical;
+        struct buffer_head *head, *bh;
+        unsigned long index, end;
+        struct pagevec pvec;
+        int nr_pages, i;
+        index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        pagevec_init(&pvec, 0);
+        while (index <= end) {
+                /* XXX: optimize tail */
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        index = page->index;
+                        if (index > end)
+                                break;
+                        index++;
+                        BUG_ON(!PageLocked(page));
+                        BUG_ON(PageWriteback(page));
+                        BUG_ON(!page_has_buffers(page));
+                        bh = page_buffers(page);
+                        head = bh;
+                        /* skip blocks out of the range */
+                        do {
+                                if (cur_logical >= logical)
+                                        break;
+                                cur_logical++;
+                        } while ((bh = bh->b_this_page) != head);
+                        do {
+                                if (cur_logical >= logical + blocks)
+                                        break;
+                                if (buffer_delay(bh)) {
+                                        bh->b_blocknr = pblock;
+                                        clear_buffer_delay(bh);
+                                } else if (buffer_mapped(bh))
+                                        BUG_ON(bh->b_blocknr != pblock);
+                                cur_logical++;
+                                pblock++;
+                        } while ((bh = bh->b_this_page) != head);
+                }
+                pagevec_release(&pvec);
+        }
+}
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+                                             struct buffer_head *bh)
+{
+        struct block_device *bdev = inode->i_sb->s_bdev;
+        int blocks, i;
+        blocks = bh->b_size >> inode->i_blkbits;
+        for (i = 0; i < blocks; i++)
+                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        int err = 0, remain = lbh->b_size;
+        sector_t next = lbh->b_blocknr;
+        struct buffer_head new;
+        /*
+         * We consider only non-mapped and non-allocated blocks
+         */
+        if (buffer_mapped(lbh) && !buffer_delay(lbh))
+                return;
+        while (remain) {
+                new.b_state = lbh->b_state;
+                new.b_blocknr = 0;
+                new.b_size = remain;
+                err = mpd->get_block(mpd->inode, next, &new, 1);
+                if (err) {
+                        /*
+                         * Rather than implement own error handling
+                         * here, we just leave remaining blocks
+                         * unallocated and try again with ->writepage()
+                         */
+                        break;
+                }
+                BUG_ON(new.b_size == 0);
+                if (buffer_new(&new))
+                        __unmap_underlying_blocks(mpd->inode, &new);
+                /*
+                 * If blocks are delayed marked, we need to
+                 * put actual blocknr and drop delayed bit
+                 */
+                if (buffer_delay(lbh))
+                        mpage_put_bnr_to_bhs(mpd, next, &new);
+                /* go for the remaining blocks */
+                next += new.b_size >> mpd->inode->i_blkbits;
+                remain -= new.b_size;
+        }
+}
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * The function is used to collect contiguous blocks in the same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+                                   sector_t logical, struct buffer_head *bh)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        sector_t next;
+        next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+        /*
+         * First block in the extent
+         */
+        if (lbh->b_size == 0) {
+                lbh->b_blocknr = logical;
+                lbh->b_size = bh->b_size;
+                lbh->b_state = bh->b_state & BH_FLAGS;
+                return;
+        }
+        /*
+         * Can we merge the block to our big extent?
+         */
+        if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+                lbh->b_size += bh->b_size;
+                return;
+        }
+        /*
+         * We couldn't merge the block into our extent, so we
+         * need to flush the current extent and start a new one
+         */
+        mpage_da_map_blocks(mpd);
+        /*
+         * Now start a new extent
+         */
+        lbh->b_size = bh->b_size;
+        lbh->b_state = bh->b_state & BH_FLAGS;
+        lbh->b_blocknr = logical;
+}
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+                                struct writeback_control *wbc, void *data)
+{
+        struct mpage_da_data *mpd = data;
+        struct inode *inode = mpd->inode;
+        struct buffer_head *bh, *head, fake;
+        sector_t logical;
+        /*
+         * Can we merge this page to current extent?
+         */
+        if (mpd->next_page != page->index) {
+                /*
+                 * Nope, we can't. So, we map non-allocated blocks
+                 * and start IO on them using __mpage_writepage()
+                 */
+                if (mpd->next_page != mpd->first_page) {
+                        mpage_da_map_blocks(mpd);
+                        mpage_da_submit_io(mpd);
+                }
+                /*
+                 * Start next extent of pages ...
+                 */
+                mpd->first_page = page->index;
+                /*
+                 * ... and blocks
+                 */
+                mpd->lbh.b_size = 0;
+                mpd->lbh.b_state = 0;
+                mpd->lbh.b_blocknr = 0;
+        }
+        mpd->next_page = page->index + 1;
+        logical = (sector_t) page->index <<
+                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        if (!page_has_buffers(page)) {
+                /*
+                 * There are no attached buffer heads yet (mmap?)
+                 * we treat the page as full of dirty blocks
+                 */
+                bh = &fake;
+                bh->b_size = PAGE_CACHE_SIZE;
+                bh->b_state = 0;
+                set_buffer_dirty(bh);
+                set_buffer_uptodate(bh);
+                mpage_add_bh_to_extent(mpd, logical, bh);
+        } else {
+                /*
+                 * Page with regular buffer heads, just add all dirty ones
+                 */
+                head = page_buffers(page);
+                bh = head;
+                do {
+                        BUG_ON(buffer_locked(bh));
+                        if (buffer_dirty(bh))
+                                mpage_add_bh_to_extent(mpd, logical, bh);
+                        logical++;
+                } while ((bh = bh->b_this_page) != head);
+        }
+        return 0;
+}
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issues IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+                               struct writeback_control *wbc,
+                               get_block_t get_block)
+{
+        struct mpage_da_data mpd;
+        int ret;
+        if (!get_block)
+                return generic_writepages(mapping, wbc);
+        mpd.wbc = wbc;
+        mpd.inode = mapping->host;
+        mpd.lbh.b_size = 0;
+        mpd.lbh.b_state = 0;
+        mpd.lbh.b_blocknr = 0;
+        mpd.first_page = 0;
+        mpd.next_page = 0;
+        mpd.get_block = get_block;
+        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+        /*
+         * Handle last extent of pages
+         */
+        if (mpd.next_page != mpd.first_page) {
+                mpage_da_map_blocks(&mpd);
+                mpage_da_submit_io(&mpd);
+        }
+        return ret;
+}
+/*
+ * Block mapper used as a callback for ->write_begin() only.  Its job is to
+ * either report an already mapped block or to reserve space for one block.
+ *
+ * Returns 0 on success, the negative result of the lookup, or the error
+ * from ext4_da_reserve_space() when the reservation fails.
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+        int err;
+        BUG_ON(create == 0);
+        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        /*
+         * Look the block up first, without allocating.  Preallocated
+         * blocks are unmapped but should be treated the same as
+         * allocated blocks.
+         */
+        err = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+        if (err > 0) {
+                /* mapped: report the length of the mapped range */
+                bh_result->b_size = (err << inode->i_blkbits);
+                return 0;
+        }
+        /* lookup failed, or the buffer is already marked delayed */
+        if (err < 0 || buffer_delay(bh_result))
+                return err;
+        /*
+         * The block isn't (pre)allocated yet, let's reserve space.
+         * XXX: __block_prepare_write() unmaps passed block, is it OK?
+         */
+        err = ext4_da_reserve_space(inode, 1);
+        if (err)
+                /* not enough space to reserve */
+                return err;
+        map_bh(bh_result, inode->i_sb, 0);
+        set_buffer_new(bh_result);
+        set_buffer_delay(bh_result);
+        return 0;
+}
+#define EXT4_DELALLOC_RSVED 1
+/*
+ * Block mapper that ext4_da_writepages() hands to mpage_da_writepages().
+ *
+ * With a current journal handle the caller's @create is honoured and
+ * EXT4_DELALLOC_RSVED is passed as the last argument of the lookup.
+ * Without a handle only a plain lookup is done, and a result of 0 from
+ * that lookup is a BUG.
+ *
+ * When blocks were mapped, bh_result->b_size is set to the mapped length
+ * and i_disksize is advanced to the end of the mapped range, capped at
+ * i_size.  Returns 0, the result of ext4_mark_inode_dirty() when this call
+ * moved i_disksize, or the non-positive result of the lookup.
+ */
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        handle_t *handle = ext4_journal_current_handle();
+        loff_t new_disksize;
+        int ret;
+        if (handle) {
+                ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+        } else {
+                ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, 0, 0, 0);
+                BUG_ON(!ret);
+        }
+        if (ret <= 0)
+                return ret;
+        bh_result->b_size = (ret << inode->i_blkbits);
+        /*
+         * Update on-disk size along with block allocation
+         * we don't use 'extend_disksize' as size may change
+         * within already allocated block -bzzz
+         */
+        new_disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+        if (new_disksize > i_size_read(inode))
+                new_disksize = i_size_read(inode);
+        if (new_disksize <= EXT4_I(inode)->i_disksize)
+                return 0;
+        /*
+         * XXX: replace with spinlock if seen contended -bzzz
+         */
+        down_write(&EXT4_I(inode)->i_data_sem);
+        if (new_disksize > EXT4_I(inode)->i_disksize)
+                EXT4_I(inode)->i_disksize = new_disksize;
+        up_write(&EXT4_I(inode)->i_data_sem);
+        /* only dirty the inode when i_disksize now holds our value */
+        if (EXT4_I(inode)->i_disksize != new_disksize)
+                return 0;
+        return ext4_mark_inode_dirty(handle, inode);
+}
+/*
+ * walk_page_buffers() callback: returns 1 for a dirty buffer that is
+ * either unmapped (possible for holes) or delayed (possible with delayed
+ * allocation), 0 otherwise.  @handle is not used.
+ */
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+        if (!buffer_dirty(bh))
+                return 0;
+        if (buffer_delay(bh))
+                return 1;
+        return !buffer_mapped(bh);
+}
+/*
+ * Block mapper for the writepage paths.  It only looks blocks up: @create
+ * is ignored and the lookup is always done with create = 0, because we
+ * don't want to do block allocation in writepage.
+ *
+ * Returns 0 after setting bh_result->b_size to the mapped length when
+ * blocks were found, otherwise the non-positive result of the lookup.
+ */
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        unsigned nr_blocks = bh_result->b_size >> inode->i_blkbits;
+        int mapped;
+        mapped = ext4_get_blocks_wrap(NULL, inode, iblock, nr_blocks,
+                                   bh_result, 0, 0, 0);
+        if (mapped <= 0)
+                return mapped;
+        bh_result->b_size = (mapped << inode->i_blkbits);
+        return 0;
+}
+/*
+ * writepage() used by ext4_da_aops.  It never allocates blocks: if any
+ * dirty buffer in the first len bytes of the page is still unmapped or
+ * delayed, the page is redirtied, unlocked and 0 is returned.  Otherwise
+ * the page is written with ext4_normal_get_block_write as the block mapper.
+ *
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+                                struct writeback_control *wbc)
+{
+        int ret = 0;
+        loff_t size;
+        unsigned long len;
+        struct buffer_head *page_bufs;
+        struct inode *inode = page->mapping->host;
+        size = i_size_read(inode);
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                page_bufs = page_buffers(page);
+                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                        ext4_bh_unmapped_or_delay)) {
+                        /*
+                         * We don't want to do block allocation,
+                         * so redirty the page and return.
+                         * We may reach here when we do a journal commit
+                         * via journal_submit_inode_data_buffers.
+                         * If we don't have mapping block we just ignore
+                         * them. We can also reach here via shrink_page_list
+                         */
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        return 0;
+                }
+        } else {
+                /*
+                 * The test for page_has_buffers() is subtle:
+                 * We know the page is dirty but it lost buffers. That means
+                 * that at some moment in time after write_begin()/write_end()
+                 * has been called all buffers have been clean and thus they
+                 * must have been written at least once. So they are all
+                 * mapped and we can happily proceed with mapping them
+                 * and writing the page.
+                 *
+                 * Try to initialize the buffer_heads and check whether
+                 * all are mapped and non delay. We don't want to
+                 * do block allocation here.
+                 */
+                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                                                ext4_normal_get_block_write);
+                if (!ret) {
+                        page_bufs = page_buffers(page);
+                        /* check whether all are mapped and non delay */
+                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                                ext4_bh_unmapped_or_delay)) {
+                                redirty_page_for_writepage(wbc, page);
+                                unlock_page(page);
+                                return 0;
+                        }
+                } else {
+                        /*
+                         * We can't do block allocation here
+                         * so just redirty the page and unlock
+                         * and return
+                         */
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        return 0;
+                }
+        }
+        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+                ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+        else
+                ret = block_write_full_page(page,
+                                                ext4_normal_get_block_write,
+                                                wbc);
+        return ret;
+}
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+/*
+ * writepages() used by ext4_da_aops.
+ *
+ * The request in wbc->nr_to_write is served in chunks of at most
+ * EXT4_MAX_WRITEBACK_PAGES pages.  Each chunk runs inside its own
+ * transaction of EXT4_MAX_WRITEBACK_CREDITS credits and is written by
+ * mpage_da_writepages() with ext4_da_get_block_write as the block mapper.
+ *
+ * On return wbc->nr_to_write holds the part of the request that was not
+ * written.  When range_cyclic is not set, range_cont is forced on and the
+ * caller's wbc->range_start is saved on entry and put back on exit.
+ *
+ * Returns 0 or the first error from ext4_journal_start(),
+ * ext4_jbd2_file_inode() or mpage_da_writepages().
+ */
+static int ext4_da_writepages(struct address_space *mapping,
+                                struct writeback_control *wbc)
+{
+        struct inode *inode = mapping->host;
+        handle_t *handle = NULL;
+        int needed_blocks;
+        int ret = 0;
+        long to_write;
+        loff_t range_start = 0;
+        int range_saved = 0;
+        /*
+         * No pages to write? This is mainly a kludge to avoid starting
+         * a transaction for special inodes like journal inode on last iput()
+         * because that could violate lock ordering on umount
+         */
+        if (!mapping->nrpages)
+                return 0;
+        /*
+         * Estimate the worst case needed credits to write out
+         * EXT4_MAX_WRITEBACK_PAGES pages
+         */
+        needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+        to_write = wbc->nr_to_write;
+        if (!wbc->range_cyclic) {
+                /*
+                 * If range_cyclic is not set force range_cont
+                 * and save the old writeback_index
+                 */
+                wbc->range_cont = 1;
+                range_start =  wbc->range_start;
+                range_saved = 1;
+        }
+        while (!ret && to_write) {
+                /* start a new transaction*/
+                handle = ext4_journal_start(inode, needed_blocks);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        goto out_writepages;
+                }
+                if (ext4_should_order_data(inode)) {
+                        /*
+                         * With ordered mode we need to add
+                         * the inode to the journal handle
+                         * when we do block allocation.
+                         */
+                        ret = ext4_jbd2_file_inode(handle, inode);
+                        if (ret) {
+                                ext4_journal_stop(handle);
+                                goto out_writepages;
+                        }
+                }
+                /*
+                 * set the max dirty pages could be write at a time
+                 * to fit into the reserved transaction credits
+                 */
+                if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+                        wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+                /* to_write now counts what is left after this chunk */
+                to_write -= wbc->nr_to_write;
+                ret = mpage_da_writepages(mapping, wbc,
+                                                ext4_da_get_block_write);
+                ext4_journal_stop(handle);
+                if (wbc->nr_to_write) {
+                        /*
+                         * There is no more writeout needed
+                         * or we requested for a noblocking writeout
+                         * and we found the device congested
+                         */
+                        to_write += wbc->nr_to_write;
+                        break;
+                }
+                wbc->nr_to_write = to_write;
+        }
+out_writepages:
+        wbc->nr_to_write = to_write;
+        /*
+         * Put back the caller's range_start whenever it was saved.  Testing
+         * the saved value itself would skip the restore for a start of 0.
+         */
+        if (range_saved)
+                wbc->range_start = range_start;
+        return ret;
+}
+/*
+ * write_begin() used by ext4_da_aops.
+ *
+ * Starts a one-credit transaction, grabs the page cache page that covers
+ * @pos and prepares it through block_write_begin() with
+ * ext4_da_get_block_prep as the block mapper.
+ *
+ * On success the page is returned in *@pagep and the transaction is left
+ * running; ext4_da_write_end() stops the current handle.  On any failure
+ * the transaction is stopped before returning, and the page, if one was
+ * grabbed, is unlocked and released.  -ENOSPC is retried for as long as
+ * ext4_should_retry_alloc() allows.
+ *
+ * Returns the result of block_write_begin(), -ENOMEM when no page could
+ * be grabbed, or the error from ext4_journal_start().
+ */
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned flags,
+                                struct page **pagep, void **fsdata)
+{
+        int ret, retries = 0;
+        struct page *page;
+        pgoff_t index;
+        struct inode *inode = mapping->host;
+        handle_t *handle;
+        index = pos >> PAGE_CACHE_SHIFT;
+retry:
+        /*
+         * With delayed allocation, we don't log the i_disksize update
+         * if there is delayed block allocation. But we still need
+         * to journal the i_disksize update if the write goes to the end
+         * of a file which has an already mapped buffer.
+         */
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+        page = __grab_cache_page(mapping, index);
+        if (!page) {
+                /* don't leak the transaction started above */
+                ext4_journal_stop(handle);
+                ret = -ENOMEM;
+                goto out;
+        }
+        *pagep = page;
+        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                                                        ext4_da_get_block_prep);
+        if (ret < 0) {
+                unlock_page(page);
+                ext4_journal_stop(handle);
+                page_cache_release(page);
+        }
+        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+                goto retry;
+out:
+        return ret;
+}
+/*
+ * Check if we should update i_disksize when a write reaches the end of
+ * the file but does not require block allocation.
+ *
+ * @offset is a byte offset inside @page.  Returns 1 when the buffer that
+ * covers it is mapped and not delayed, 0 otherwise.
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+                                         unsigned long offset)
+{
+        struct inode *inode = page->mapping->host;
+        struct buffer_head *bh = page_buffers(page);
+        unsigned int skip = offset >> inode->i_blkbits;
+        /* step along the page's buffer ring to the one covering @offset */
+        while (skip--)
+                bh = bh->b_this_page;
+        return buffer_mapped(bh) && !buffer_delay(bh);
+}
+/*
+ * write_end() used by ext4_da_aops.
+ *
+ * Moves i_disksize forward when the write ends beyond it and the last
+ * written buffer needs no block allocation, completes the write through
+ * generic_write_end() and stops the current journal handle.
+ *
+ * Returns the number of bytes reported by generic_write_end(), or the
+ * first error seen.
+ */
+static int ext4_da_write_end(struct file *file,
+                                struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata)
+{
+        handle_t *handle = ext4_journal_current_handle();
+        struct inode *inode = mapping->host;
+        loff_t end_pos = pos + copied;
+        unsigned long first, last;
+        int err = 0, rc;
+        first = pos & (PAGE_CACHE_SIZE - 1);
+        last = first + copied -1;
+        /*
+         * generic_write_end() will run mark_inode_dirty() if i_size
+         * changes.  So let's piggyback the i_disksize mark_inode_dirty
+         * into that.
+         */
+        if (end_pos > EXT4_I(inode)->i_disksize &&
+            ext4_da_should_update_i_disksize(page, last)) {
+                down_write(&EXT4_I(inode)->i_data_sem);
+                /* recheck now that i_data_sem is held */
+                if (end_pos > EXT4_I(inode)->i_disksize) {
+                        /*
+                         * Updating i_disksize when extending file
+                         * without needing block allocation
+                         */
+                        if (ext4_should_order_data(inode))
+                                err = ext4_jbd2_file_inode(handle, inode);
+                        EXT4_I(inode)->i_disksize = end_pos;
+                }
+                up_write(&EXT4_I(inode)->i_data_sem);
+        }
+        rc = generic_write_end(file, mapping, pos, len, copied,
+                                                        page, fsdata);
+        copied = rc;
+        if (rc < 0)
+                err = rc;
+        rc = ext4_journal_stop(handle);
+        if (!err)
+                err = rc;
+        if (err)
+                return err;
+        return copied;
+}
+/*
+ * invalidatepage() used by ext4_da_aops.  Drops the reserved blocks of a
+ * page that has buffers, then hands over to ext4_invalidatepage().  The
+ * page must be locked.
+ */
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+        BUG_ON(!PageLocked(page));
+        /* without buffers there is no reservation to release */
+        if (page_has_buffers(page))
+                ext4_da_page_release_reservation(page, offset);
+        ext4_invalidatepage(page, offset);
+}
 /*
 * bmap() is special.  It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
        journal_t *journal;
        int err;
+        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+                        test_opt(inode->i_sb, DELALLOC)) {
+                /*
+                 * With delalloc we want to sync the file
+                 * so that we can make sure we allocate
+                 * blocks for file
+                 */
+                filemap_write_and_wait(mapping);
+        }
        if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
        return 0;
 }
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-        if (buffer_mapped(bh))
-                return ext4_journal_dirty_data(handle, bh);
-        return 0;
-}
 /*
- * Note that we always start a transaction even if we're not journalling
+ * Note that we don't need to start a transaction unless we're journaling data
- * data.  This is to preserve ordering: any hole instantiation within
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * __block_write_full_page -> ext4_get_block() should be journalled
+ * need to file the inode to the transaction's list in ordered mode because if
- * along with the data so we don't crash and then get metadata which
+ * we are writing back data added by write(), the inode is already there and if
- * refers to old data.
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
 *
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
 */
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
                                 struct writeback_control *wbc)
 {
         struct inode *inode = page->mapping->host;
-        struct buffer_head *page_bufs;
-        handle_t *handle = NULL;
-        int ret = 0;
-        int err;
-        J_ASSERT(PageLocked(page));
-        /*
-         * We give up here if we're reentered, because it might be for a
-         * different filesystem.
-         */
-        if (ext4_journal_current_handle())
-                goto out_fail;
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        /*
+         * Write the page with ext4_normal_get_block_write as the block
+         * mapper: through nobh_writepage() when the NOBH mount option is
+         * set, through block_write_full_page() otherwise.
+         */
+        if (test_opt(inode->i_sb, NOBH))
+                return nobh_writepage(page,
+                                        ext4_normal_get_block_write, wbc);
+        else
+                return block_write_full_page(page,
+                                                ext4_normal_get_block_write,
+                                                wbc);
+}
-        if (IS_ERR(handle)) {
+static int ext4_normal_writepage(struct page *page,
-                ret = PTR_ERR(handle);
+                                struct writeback_control *wbc)
-                goto out_fail;
+{
-        }
+        struct inode *inode = page->mapping->host;
+        loff_t size = i_size_read(inode);
+        loff_t len;
-        if (!page_has_buffers(page)) {
+        J_ASSERT(PageLocked(page));
-                create_empty_buffers(page, inode->i_sb->s_blocksize,
+        if (page->index == size >> PAGE_CACHE_SHIFT)
-                                (1 << BH_Dirty)|(1 << BH_Uptodate));
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                /*
+                 * If the page has buffers, every dirty buffer in its
+                 * first len bytes must be mapped and must not be
+                 * delayed; anything else is a BUG.
+                 *
+                 * If there are no buffers attached to the page, we know
+                 * the page is dirty but it lost its buffers. That means
+                 * that at some moment in time after write_begin() /
+                 * write_end() has been called all buffers have been
+                 * clean and thus they must have been written at least
+                 * once. So they are all mapped and we can happily
+                 * proceed with mapping them and writing the page.
+                 */
+                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                        ext4_bh_unmapped_or_delay));
         }
-        page_bufs = page_buffers(page);
-        walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, bget_one);
-        ret = block_write_full_page(page, ext4_get_block, wbc);
-        /*
+        if (!ext4_journal_current_handle())
-         * The page can become unlocked at any point now, and
+                return __ext4_normal_writepage(page, wbc);
-         * truncate can then come in and change things.  So we
-         * can't touch *page from now on.  But *page_bufs is
-         * safe due to elevated refcount.
-         */
-        /*
-         * And attach them to the current transaction.  But only if
-         * block_write_full_page() succeeded.  Otherwise they are unmapped,
-         * and generally junk.
-         */
-        if (ret == 0) {
-                err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
-                                        NULL, jbd2_journal_dirty_data_fn);
-                if (!ret)
-                        ret = err;
-        }
-        walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, bput_one);
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-        return ret;
-out_fail:
         redirty_page_for_writepage(wbc, page);
         unlock_page(page);
-        return ret;
+        return 0;
 }
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
                                 struct writeback_control *wbc)
 {
-        struct inode *inode = page->mapping->host;
+        struct address_space *mapping = page->mapping;
+        struct inode *inode = mapping->host;
+        struct buffer_head *page_bufs;
         handle_t *handle = NULL;
         int ret = 0;
         int err;
-        if (ext4_journal_current_handle())
+        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                goto out_fail;
+                                        ext4_normal_get_block_write);
+        if (ret != 0)
+                goto out_unlock;
+        page_bufs = page_buffers(page);
+        walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+                                                                bget_one);
+        /* As soon as we unlock the page, it can go away, but we have
+         * references to buffers so we are safe.  The page is unlocked
+         * before the transaction is started; the bget_one walk above is
+         * paired with the bput_one walk done after the transaction has
+         * been stopped.
+         * NOTE(review): when ext4_journal_start() fails we jump to out
+         * without the bput_one walk - confirm the buffer references
+         * taken above are dropped somewhere on that path. */
+        unlock_page(page);
         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
         if (IS_ERR(handle)) {
                 ret = PTR_ERR(handle);
-                goto out_fail;
+                goto out;
         }
-        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+        ret = walk_page_buffers(handle, page_bufs, 0,
-                ret = nobh_writepage(page, ext4_get_block, wbc);
+                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-        else
-                ret = block_write_full_page(page, ext4_get_block, wbc);
+        err = walk_page_buffers(handle, page_bufs, 0,
+                                PAGE_CACHE_SIZE, NULL, write_end_fn);
+        if (ret == 0)
+                ret = err;
         err = ext4_journal_stop(handle);
         if (!ret)
                 ret = err;
-        return ret;
-out_fail:
+        walk_page_buffers(handle, page_bufs, 0,
-        redirty_page_for_writepage(wbc, page);
+                                PAGE_CACHE_SIZE, NULL, bput_one);
+        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+        goto out;
+out_unlock:
         unlock_page(page);
+out:
         return ret;
 }
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
                                struct writeback_control *wbc)
 {
        struct inode *inode = page->mapping->host;
-        handle_t *handle = NULL;
+        loff_t size = i_size_read(inode);
-        int ret = 0;
+        loff_t len;
-        int err;
-        if (ext4_journal_current_handle())
+        J_ASSERT(PageLocked(page));
-                goto no_write;
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                /* if page has buffers it should all be mapped
+                 * and allocated. If there are not buffers attached
+                 * to the page we know the page is dirty but it lost
+                 * buffers. That means that at some moment in time
+                 * after write_begin() / write_end() has been called
+                 * all buffers have been clean and thus they must have been
+                 * written at least once. So they are all mapped and we can
+                 * happily proceed with mapping them and writing the page.
+                 */
+                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                        ext4_bh_unmapped_or_delay));
+        }
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (ext4_journal_current_handle())
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
                goto no_write;
-        }
-        if (!page_has_buffers(page) || PageChecked(page)) {
+        if (PageChecked(page)) {
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                return __ext4_journalled_writepage(page, wbc);
-                                        ext4_get_block);
-                if (ret != 0) {
-                        ext4_journal_stop(handle);
-                        goto out_unlock;
-                }
-                ret = walk_page_buffers(handle, page_buffers(page), 0,
-                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-                err = walk_page_buffers(handle, page_buffers(page), 0,
-                                PAGE_CACHE_SIZE, NULL, write_end_fn);
-                if (ret == 0)
-                        ret = err;
-                EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-                unlock_page(page);
        } else {
                /*
                 * It may be a page full of checkpoint-mode buffers.  We don't
                 * really know unless we go poke around in the buffer_heads.
                 * But block_write_full_page will do the right thing.
                 */
-                ret = block_write_full_page(page, ext4_get_block, wbc);
+                return block_write_full_page(page,
+                                                ext4_normal_get_block_write,
+                                                wbc);
        }
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-out:
-        return ret;
 no_write:
        redirty_page_for_writepage(wbc, page);
-out_unlock:
        unlock_page(page);
-        goto out;
+        return 0;
 }
 static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
        .readpage       = ext4_readpage,
        .readpages      = ext4_readpages,
-        .writepage      = ext4_ordered_writepage,
+        .writepage      = ext4_normal_writepage,
        .sync_page      = block_sync_page,
        .write_begin    = ext4_write_begin,
        .write_end      = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 static const struct address_space_operations ext4_writeback_aops = {
        .readpage       = ext4_readpage,
        .readpages      = ext4_readpages,
-        .writepage      = ext4_writeback_writepage,
+        .writepage      = ext4_normal_writepage,
        .sync_page      = block_sync_page,
        .write_begin    = ext4_write_begin,
        .write_end      = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
        .releasepage    = ext4_releasepage,
 };
+/*
+ * Address space operations for delayed allocation.  ext4_set_aops()
+ * installs this table when the DELALLOC mount option is set and the inode
+ * uses ordered or writeback data mode.
+ */
+static const struct address_space_operations ext4_da_aops = {
+        .readpage       = ext4_readpage,
+        .readpages      = ext4_readpages,
+        .writepage      = ext4_da_writepage,
+        .writepages     = ext4_da_writepages,
+        .sync_page      = block_sync_page,
+        .write_begin    = ext4_da_write_begin,
+        .write_end      = ext4_da_write_end,
+        .bmap           = ext4_bmap,
+        .invalidatepage = ext4_da_invalidatepage,
+        .releasepage    = ext4_releasepage,
+        .direct_IO      = ext4_direct_IO,
+        .migratepage    = buffer_migrate_page,
+};
 void ext4_set_aops(struct inode *inode)
 {
-        if (ext4_should_order_data(inode))
+        if (ext4_should_order_data(inode) &&
+                test_opt(inode->i_sb, DELALLOC))
+                inode->i_mapping->a_ops = &ext4_da_aops;
+        else if (ext4_should_order_data(inode))
                inode->i_mapping->a_ops = &ext4_ordered_aops;
+        else if (ext4_should_writeback_data(inode) &&
+                 test_opt(inode->i_sb, DELALLOC))
+                inode->i_mapping->a_ops = &ext4_da_aops;
        else if (ext4_should_writeback_data(inode))
                inode->i_mapping->a_ops = &ext4_writeback_aops;
        else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
 * This required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 */
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
+        struct page *page;
        int err = 0;
+        page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+        if (!page)
+                return -EINVAL;
        blocksize = inode->i_sb->s_blocksize;
        length = blocksize - (offset & (blocksize - 1));
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
                err = ext4_journal_dirty_metadata(handle, bh);
        } else {
                if (ext4_should_order_data(inode))
-                        err = ext4_journal_dirty_data(handle, bh);
+                        err = ext4_jbd2_file_inode(handle, inode);
                mark_buffer_dirty(bh);
        }
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
        if (this_bh) {
                BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
-                ext4_journal_dirty_metadata(handle, this_bh);
+                /*
+                 * The buffer head should have an attached journal head at this
+                 * point. However, if the data is corrupted and an indirect
+                 * block pointed to itself, it would have been detached when
+                 * the block was cleared. Check for this instead of OOPSing.
+                 */
+                if (bh2jh(this_bh))
+                        ext4_journal_dirty_metadata(handle, this_bh);
+                else
+                        ext4_error(inode->i_sb, __func__,
+                                   "circular indirect block detected, "
+                                   "inode=%lu, block=%llu",
+                                   inode->i_ino,
+                                   (unsigned long long) this_bh->b_blocknr);
        }
 }
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
        }
 }
+/*
+ * Tell whether @inode is something ext4 is willing to truncate.
+ *
+ * Returns 0 for append-only or immutable inodes, 1 for regular files and
+ * directories, 1 for symlinks that are not fast symlinks, and 0 for every
+ * other kind of inode.
+ */
+int ext4_can_truncate(struct inode *inode)
+{
+        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                return 0;
+        if (S_ISLNK(inode->i_mode))
+                return !ext4_inode_is_fast_symlink(inode);
+        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
+                return 1;
+        return 0;
+}
 /*
 * ext4_truncate()
 *
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
        int n;
        ext4_lblk_t last_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
-        struct page *page;
-        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+        if (!ext4_can_truncate(inode))
-            S_ISLNK(inode->i_mode)))
-                return;
-        if (ext4_inode_is_fast_symlink(inode))
-                return;
-        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
-        /*
-         * We have to lock the EOF page here, because lock_page() nests
-         * outside jbd2_journal_start().
-         */
-        if ((inode->i_size & (blocksize - 1)) == 0) {
-                /* Block boundary? Nothing to do */
-                page = NULL;
-        } else {
-                page = grab_cache_page(mapping,
-                                inode->i_size >> PAGE_CACHE_SHIFT);
-                if (!page)
-                        return;
-        }
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-                ext4_ext_truncate(inode, page);
+                ext4_ext_truncate(inode);
                return;
        }
        handle = start_transaction(inode);
-        if (IS_ERR(handle)) {
+        if (IS_ERR(handle))
-                if (page) {
-                        clear_highpage(page);
-                        flush_dcache_page(page);
-                        unlock_page(page);
-                        page_cache_release(page);
-                }
                return;         /* AKPM: return what? */
-        }
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
-        if (page)
+        if (inode->i_size & (blocksize - 1))
-                ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+                if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+                        goto out_stop;
        n = ext4_block_to_path(inode, last_block, offsets, NULL);
        if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
                goto out_stop;
        /*
+         * From here we block out all ext4_get_block() callers who want to
+         * modify the block allocation tree.
+         */
+        down_write(&ei->i_data_sem);
+        /*
         * The orphan list entry will now protect us from any crash which
         * occurs before the truncate completes, so it is now safe to propagate
         * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
         */
        ei->i_disksize = inode->i_size;
-        /*
-         * From here we block out all ext4_get_block() callers who want to
-         * modify the block allocation tree.
-         */
-        down_write(&ei->i_data_sem);
        if (n == 1) {           /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
 */
 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                if (!error)
                        error = rc;
                ext4_journal_stop(handle);
+                if (ext4_should_order_data(inode)) {
+                        error = ext4_begin_ordered_truncate(inode,
+                                                            attr->ia_size);
+                        if (error) {
+                                /* Do as much error cleanup as possible */
+                                handle = ext4_journal_start(inode, 3);
+                                if (IS_ERR(handle)) {
+                                        ext4_orphan_del(NULL, inode);
+                                        goto err_out;
+                                }
+                                ext4_orphan_del(handle, inode);
+                                ext4_journal_stop(handle);
+                                goto err_out;
+                        }
+                }
        }
        rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
        return error;
 }
+/*
+ * ext4_getattr() - fill @stat for the inode behind @dentry.
+ *
+ * generic_fillattr() supplies the regular attributes; the blocks currently
+ * reserved for delayed allocation are then added to stat->blocks.
+ * @mnt is not used.  Always returns 0.
+ */
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                 struct kstat *stat)
+{
+        struct inode *inode;
+        unsigned long long delalloc_blocks;
+        inode = dentry->d_inode;
+        generic_fillattr(inode, stat);
+        /*
+         * We can't update i_blocks if the block allocation is delayed
+         * otherwise in the case of system crash before the real block
+         * allocation is done, we will have i_blocks inconsistent with
+         * on-disk file blocks.
+         * We always keep i_blocks updated together with real
+         * allocation. But to not confuse the user, stat
+         * will return the blocks that include the delayed allocation
+         * blocks for this file.
+         */
+        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        /*
+         * Convert filesystem blocks to 512-byte units with a single shift
+         * in 64-bit arithmetic.  Shifting an unsigned long left by the full
+         * s_blocksize_bits and then right by 9 wraps on 32-bit machines as
+         * soon as 4GB worth of blocks is reserved.
+         * Assumes s_blocksize_bits >= 9 (block size of at least 512 bytes).
+         */
+        stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
+        return 0;
+}
 /*
 * How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        return err;
 }
+/*
+ * Buffer callback handed to walk_page_buffers(): yields 1 for a buffer
+ * that is not mapped and 0 for a mapped one.  @handle is not used.
+ */
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+        if (buffer_mapped(bh))
+                return 0;
+        return 1;
+}
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+        loff_t size;
+        unsigned long len;
+        int ret = -EINVAL;      /* returned if the page is no longer valid */
+        struct file *file = vma->vm_file;
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct address_space *mapping = inode->i_mapping;
+        /*
+         * Takes a page of the file mapped by @vma and runs it through
+         * write_begin/write_end so that block allocation/reservation is
+         * done for it.  Returns 0 on success (or when nothing needs to be
+         * done), -EINVAL if the page no longer belongs to this mapping, lies
+         * at or beyond i_size or is not up to date, or the negative error
+         * from write_begin/write_end.
+         *
+         * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+         * get i_mutex because we are already holding mmap_sem.
+         */
+        down_read(&inode->i_alloc_sem);
+        size = i_size_read(inode);
+        if (page->mapping != mapping || size <= page_offset(page)
+            || !PageUptodate(page)) {
+                /* page got truncated from under us? */
+                goto out_unlock;
+        }
+        ret = 0;
+        if (PageMappedToDisk(page))
+                goto out_unlock;
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;  /* page holding i_size: stop there */
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                /* return if we have all the buffers mapped */
+                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                       ext4_bh_unmapped))
+                        goto out_unlock;
+        }
+        /*
+         * OK, we need to fill the hole... Do write_begin/write_end
+         * to do block allocation/reservation. We are not holding
+         * inode->i_mutex here, which allows parallel write_begin/
+         * write_end calls. lock_page prevents this from happening
+         * on the same page though.
+         */
+        ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+                        len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+        if (ret < 0)
+                goto out_unlock;
+        ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+                        len, len, page, NULL);
+        if (ret < 0)
+                goto out_unlock;
+        ret = 0;        /* a non-negative write_end result is reported as success */
+out_unlock:
+        up_read(&inode->i_alloc_sem);
+        return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
-        int fix = 0;
+        int fix = 0, ret, tmpmax;
        addr = mb_correct_addr_and_bit(&fix, addr);
-        max += fix;
+        tmpmax = max + fix;
        start += fix;
-        return ext4_find_next_zero_bit(addr, max, start) - fix;
+        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+        if (ret > max)
+                return max;
+        return ret;
 }
 static inline int mb_find_next_bit(void *addr, int max, int start)
 {
-        int fix = 0;
+        int fix = 0, ret, tmpmax;
        addr = mb_correct_addr_and_bit(&fix, addr);
-        max += fix;
+        tmpmax = max + fix;
        start += fix;
-        return ext4_find_next_bit(addr, max, start) - fix;
+        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+        if (ret > max)
+                return max;
+        return ret;
 }
 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                if (!buffer_uptodate(bh[i]))
                        goto out;
+        err = 0;
        first_block = page->index * blocks_per_page;
        for (i = 0; i < blocks_per_page; i++) {
                int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        int pnum;
        int poff;
        struct page *page;
+        int ret;
        mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                if (page) {
                        BUG_ON(page->mapping != inode->i_mapping);
                        if (!PageUptodate(page)) {
-                                ext4_mb_init_cache(page, NULL);
+                                ret = ext4_mb_init_cache(page, NULL);
+                                if (ret) {
+                                        unlock_page(page);
+                                        goto err;
+                                }
                                mb_cmp_bitmaps(e4b, page_address(page) +
                                               (poff * sb->s_blocksize));
                        }
                        unlock_page(page);
                }
        }
-        if (page == NULL || !PageUptodate(page))
+        if (page == NULL || !PageUptodate(page)) {
+                ret = -EIO;
                goto err;
+        }
        e4b->bd_bitmap_page = page;
        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
        mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
                if (page) {
                        BUG_ON(page->mapping != inode->i_mapping);
-                        if (!PageUptodate(page))
+                        if (!PageUptodate(page)) {
-                                ext4_mb_init_cache(page, e4b->bd_bitmap);
+                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+                                if (ret) {
+                                        unlock_page(page);
+                                        goto err;
+                                }
+                        }
                        unlock_page(page);
                }
        }
-        if (page == NULL || !PageUptodate(page))
+        if (page == NULL || !PageUptodate(page)) {
+                ret = -EIO;
                goto err;
+        }
        e4b->bd_buddy_page = page;
        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
        mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
                page_cache_release(e4b->bd_buddy_page);
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
-        return -EIO;
+        return ret;
 }
 static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
        }
 }
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                          int first, int count)
 {
        int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr += block;
                        blocknr +=
                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        ext4_unlock_group(sb, e4b->bd_group);
                        ext4_error(sb, __func__, "double-free of inode"
                                   " %lu's block %llu(bit %u in group %lu)\n",
                                   inode ? inode->i_ino : 0, blocknr, block,
                                   e4b->bd_group);
+                        ext4_lock_group(sb, e4b->bd_group);
                }
                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
                e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                } while (1);
        }
        mb_check_buddy(e4b);
-        return 0;
 }
 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
                spin_unlock(&sbi->s_md_lock);
        }
-        /* searching for the right group start from the goal value specified */
-        group = ac->ac_g_ex.fe_group;
        /* Let's just scan groups to find more-less suitable blocks */
        cr = ac->ac_2order ? 0 : 1;
        /*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 repeat:
        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
                ac->ac_criteria = cr;
+                /*
+                 * searching for the right group start
+                 * from the goal value specified
+                 */
+                group = ac->ac_g_ex.fe_group;
                for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
                        struct ext4_group_info *grp;
                        struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
        int rc;
        int size;
+        if (unlikely(sbi->s_mb_history == NULL))
+                return -ENOMEM;
        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (s == NULL)
                return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
        sbi->s_mb_history_cur = 0;
        spin_lock_init(&sbi->s_mb_history_lock);
        i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-        sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
+        sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
-        if (likely(sbi->s_mb_history != NULL))
-                memset(sbi->s_mb_history, 0, i);
        /* if we can't allocate history, then we simple won't use it */
 }
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 #define ext4_mb_history_init(sb)
 #endif
+/*
+ * Create and initialize ext4_group_info data for the given group.
+ *
+ * @sb:    super block the group belongs to
+ * @group: number of the group to set up
+ * @desc:  group descriptor of @group, used to seed bb_free
+ *
+ * When @group is the first group of a descriptor block, a new table of
+ * ext4_group_info pointers is allocated and stored in sbi->s_group_info
+ * before the group's own ext4_group_info is allocated.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.  On failure
+ * a table allocated by this call is freed and its slot in
+ * sbi->s_group_info is reset to NULL.
+ */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+                          struct ext4_group_desc *desc)
+{
+        int i, len;
+        int metalen = 0;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info **meta_group_info;
+        /*
+         * First check if this group is the first of a reserved block.
+         * If it's true, we have to allocate a new table of pointers
+         * to ext4_group_info structures
+         */
+        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+                metalen = sizeof(*meta_group_info) <<
+                        EXT4_DESC_PER_BLOCK_BITS(sb);
+                meta_group_info = kmalloc(metalen, GFP_KERNEL);
+                if (meta_group_info == NULL) {
+                        printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+                               "buddy group\n");
+                        goto exit_meta_group_info;
+                }
+                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+                        meta_group_info;
+        }
+        /*
+         * calculate needed size. if change bb_counters size,
+         * don't forget about ext4_mb_generate_buddy()
+         */
+        len = offsetof(typeof(**meta_group_info),
+                       bb_counters[sb->s_blocksize_bits + 2]);
+        meta_group_info =
+                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+        meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+        if (meta_group_info[i] == NULL) {
+                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+                goto exit_group_info;
+        }
+        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+                &(meta_group_info[i]->bb_state));
+        /*
+         * initialize bb_free to be able to skip
+         * empty groups without initialization
+         */
+        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                meta_group_info[i]->bb_free =
+                        ext4_free_blocks_after_init(sb, group, desc);
+        } else {
+                meta_group_info[i]->bb_free =
+                        le16_to_cpu(desc->bg_free_blocks_count);
+        }
+        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+#ifdef DOUBLE_CHECK
+        {
+                struct buffer_head *bh;
+                meta_group_info[i]->bb_bitmap =
+                        kmalloc(sb->s_blocksize, GFP_KERNEL);
+                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+                bh = ext4_read_block_bitmap(sb, group);
+                BUG_ON(bh == NULL);
+                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+                        sb->s_blocksize);
+                put_bh(bh);
+        }
+#endif
+        return 0;
+exit_group_info:
+        /*
+         * If a meta_group_info table has been allocated by this call,
+         * release it now and clear its slot, so that sbi->s_group_info does
+         * not keep a pointer to freed memory that later cleanup could
+         * dereference or free a second time.
+         */
+        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+        }
+exit_meta_group_info:
+        return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize.
+ *
+ * Creates the ext4_group_info for @group through ext4_mb_add_groupinfo()
+ * and then clears the up-to-date flag on the pages of sbi->s_buddy_cache
+ * that hold blocks 2 * group and 2 * group + 1, if those pages are
+ * present in the page cache.
+ *
+ * Returns 0 on success or the error returned by ext4_mb_add_groupinfo().
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *desc)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct inode *inode = sbi->s_buddy_cache;
+        int blocks_per_page;
+        int block;
+        int pnum;
+        struct page *page;
+        int err;
+        /* Add the group based on its group descriptor */
+        err = ext4_mb_add_groupinfo(sb, group, desc);
+        if (err)
+                return err;
+        /*
+         * Cache pages containing dynamic mb_alloc data (buddy and bitmap
+         * data) are set not up to date so that they will be re-initialized
+         * during the next call to ext4_mb_load_buddy
+         */
+        /*
+         * Set buddy page as not up to date
+         * NOTE(review): ext4_mb_load_buddy appears to load the bitmap page
+         * first and the buddy page second, so the "buddy"/"bitmap" labels
+         * on these two steps may be swapped - confirm. Both pages are
+         * invalidated either way.
+         */
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        page = find_get_page(inode->i_mapping, pnum);
+        if (page != NULL) {
+                ClearPageUptodate(page);
+                page_cache_release(page);       /* drop find_get_page() ref */
+        }
+        /* Set bitmap page as not up to date */
+        block++;
+        pnum = block / blocks_per_page;
+        page = find_get_page(inode->i_mapping, pnum);
+        if (page != NULL) {
+                ClearPageUptodate(page);
+                page_cache_release(page);       /* drop find_get_page() ref */
+        }
+        return 0;
+}
+/*
+ * Adjust the free-block count of an existing group by @add blocks.
+ * Used by online resize.
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+        grp->bb_free = grp->bb_free + add;
+}
 static int ext4_mb_init_backend(struct super_block *sb)
 {
        ext4_group_t i;
-        int j, len, metalen;
+        int metalen;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        int num_meta_group_infos =
+        struct ext4_super_block *es = sbi->s_es;
-                (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+        int num_meta_group_infos;
-                        EXT4_DESC_PER_BLOCK_BITS(sb);
+        int num_meta_group_infos_max;
+        int array_size;
        struct ext4_group_info **meta_group_info;
+        struct ext4_group_desc *desc;
+        /* This is the number of blocks used by GDT */
+        num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+                                1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+        /*
+         * This is the total number of blocks used by GDT including
+         * the number of reserved blocks for GDT.
+         * The s_group_info array is allocated with this value
+         * to allow a clean online resize without a complex
+         * manipulation of pointer.
+         * The drawback is the unused memory when no resize
+         * occurs but it's very low in terms of pages
+         * (see comments below)
+         * Need to handle this properly when META_BG resizing is allowed
+         */
+        num_meta_group_infos_max = num_meta_group_infos +
+                                le16_to_cpu(es->s_reserved_gdt_blocks);
+        /*
+         * array_size is the size of s_group_info array. We round it
+         * to the next power of two because this approximation is done
+         * internally by kmalloc so we can have some more memory
+         * for free here (e.g. may be used for META_BG resize).
+         */
+        array_size = 1;
+        while (array_size < sizeof(*sbi->s_group_info) *
+               num_meta_group_infos_max)
+                array_size = array_size << 1;
        /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
         * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
         * So a two level scheme suffices for now. */
-        sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
+        sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
-                                    num_meta_group_infos, GFP_KERNEL);
        if (sbi->s_group_info == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
                return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
                sbi->s_group_info[i] = meta_group_info;
        }
-        /*
-         * calculate needed size. if change bb_counters size,
-         * don't forget about ext4_mb_generate_buddy()
-         */
-        len = sizeof(struct ext4_group_info);
-        len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
        for (i = 0; i < sbi->s_groups_count; i++) {
-                struct ext4_group_desc *desc;
-                meta_group_info =
-                        sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
-                j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-                meta_group_info[j] = kzalloc(len, GFP_KERNEL);
-                if (meta_group_info[j] == NULL) {
-                        printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
-                        goto err_freebuddy;
-                }
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        printk(KERN_ERR
                                "EXT4-fs: can't read descriptor %lu\n", i);
-                        i++;
                        goto err_freebuddy;
                }
-                memset(meta_group_info[j], 0, len);
+                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
-                set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+                        goto err_freebuddy;
-                        &(meta_group_info[j]->bb_state));
-                /*
-                 * initialize bb_free to be able to skip
-                 * empty groups without initialization
-                 */
-                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                        meta_group_info[j]->bb_free =
-                                ext4_free_blocks_after_init(sb, i, desc);
-                } else {
-                        meta_group_info[j]->bb_free =
-                                le16_to_cpu(desc->bg_free_blocks_count);
-                }
-                INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-#ifdef DOUBLE_CHECK
-                {
-                        struct buffer_head *bh;
-                        meta_group_info[j]->bb_bitmap =
-                                kmalloc(sb->s_blocksize, GFP_KERNEL);
-                        BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
-                        bh = read_block_bitmap(sb, i);
-                        BUG_ON(bh == NULL);
-                        memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
-                                        sb->s_blocksize);
-                        put_bh(bh);
-                }
-#endif
        }
        return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        unsigned i;
        unsigned offset;
        unsigned max;
+        int ret;
        if (!test_opt(sb, MBALLOC))
                return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        } while (i <= sb->s_blocksize_bits + 1);
        /* init file for buddy data */
-        i = ext4_mb_init_backend(sb);
+        ret = ext4_mb_init_backend(sb);
-        if (i) {
+        if (ret != 0) {
                clear_opt(sbi->s_mount_opt, MBALLOC);
                kfree(sbi->s_mb_offsets);
                kfree(sbi->s_mb_maxs);
-                return i;
+                return ret;
        }
        spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
                ext4_lock_group(sb, md->group);
                for (i = 0; i < md->num; i++) {
                        mb_debug(" %u", md->blocks[i]);
-                        err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+                        mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
-                        BUG_ON(err != 0);
                }
                mb_debug("\n");
                ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name)                                \
+#define MB_PROC_FOPS(name)                                      \
-static int ext4_mb_read_##name(char *page, char **start,        \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)      \
-                off_t off, int count, int *eof, void *data)     \
 {                                                               \
-        struct ext4_sb_info *sbi = data;                        \
+        struct ext4_sb_info *sbi = m->private;                  \
-        int len;                                                \
+                                                                \
-        *eof = 1;                                               \
+        seq_printf(m, "%ld\n", sbi->s_mb_##name);               \
-        if (off != 0)                                           \
+        return 0;                                               \
-                return 0;                                       \
+}                                                               \
-        len = sprintf(page, "%ld\n", sbi->s_mb_##name);         \
+                                                                \
-        *start = page;                                          \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
-        return len;                                             \
+{                                                               \
-}
+        return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+}                                                               \
-#define MB_PROC_VALUE_WRITE(name)                               \
+                                                                \
-static int ext4_mb_write_##name(struct file *file,              \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file,   \
-                const char __user *buf, unsigned long cnt, void *data)  \
+                const char __user *buf, size_t cnt, loff_t *ppos)       \
 {                                                               \
-        struct ext4_sb_info *sbi = data;                        \
+        struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
        char str[32];                                           \
        long value;                                             \
        if (cnt >= sizeof(str))                                 \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file,		\
                return -ERANGE;                                 \
        sbi->s_mb_##name = value;                               \
        return cnt;                                             \
-}
+}                                                               \
+                                                                \
+static const struct file_operations ext4_mb_##name##_proc_fops = {      \
+        .owner          = THIS_MODULE,                          \
+        .open           = ext4_mb_##name##_proc_open,           \
+        .read           = seq_read,                             \
+        .llseek         = seq_lseek,                            \
+        .release        = single_release,                       \
+        .write          = ext4_mb_##name##_proc_write,          \
+};
-MB_PROC_VALUE_READ(stats);
+MB_PROC_FOPS(stats);
-MB_PROC_VALUE_WRITE(stats);
+MB_PROC_FOPS(max_to_scan);
-MB_PROC_VALUE_READ(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
+MB_PROC_FOPS(order2_reqs);
-MB_PROC_VALUE_READ(min_to_scan);
+MB_PROC_FOPS(stream_request);
-MB_PROC_VALUE_WRITE(min_to_scan);
+MB_PROC_FOPS(group_prealloc);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
 #define MB_PROC_HANDLER(name, var)                                      \
 do {                                                                    \
-        proc = create_proc_entry(name, mode, sbi->s_mb_proc);           \
+        proc = proc_create_data(name, mode, sbi->s_mb_proc,             \
+                                &ext4_mb_##var##_proc_fops, sbi);       \
        if (proc == NULL) {                                             \
                printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
                goto err_out;                                           \
        }                                                               \
-        proc->data = sbi;                                               \
-        proc->read_proc  = ext4_mb_read_##var ;                         \
-        proc->write_proc = ext4_mb_write_##var;                         \
 } while (0)
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
        struct proc_dir_entry *proc;
        char devname[64];
+        if (proc_root_ext4 == NULL) {
+                sbi->s_mb_proc = NULL;
+                return -EINVAL;
+        }
        bdevname(sb->s_bdev, devname);
        sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        err = -EIO;
-        bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
        if (!bitmap_bh)
                goto out_err;
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
        spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+        /*
+         * For delayed allocation the free blocks count has already
+         * been reduced/reserved at write_begin() time, so do not
+         * account for these blocks a second time.
+         */
+        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+                percpu_counter_sub(&sbi->s_freeblocks_counter,
+                                        ac->ac_b_ex.fe_len);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi,
+                                                          ac->ac_b_ex.fe_group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
        if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                if (bit >= end)
                        break;
                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-                if (next > end)
-                        next = end;
                start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
                                le32_to_cpu(sbi->s_es->s_first_data_block);
                mb_debug("    free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        if (list_empty(&grp->bb_prealloc_list))
                return 0;
-        bitmap_bh = read_block_bitmap(sb, group);
+        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (bitmap_bh == NULL) {
                /* error handling here */
                ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
                err = ext4_mb_load_buddy(sb, group, &e4b);
                BUG_ON(err != 0); /* error handling here */
-                bitmap_bh = read_block_bitmap(sb, group);
+                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (bitmap_bh == NULL) {
                        /* error handling here */
                        ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        sbi = EXT4_SB(sb);
        if (!test_opt(sb, MBALLOC)) {
-                block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+                block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
                                            &(ar->len), errp);
                return block;
        }
+        if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+                /*
+                 * Not reserved by delalloc, so check for free blocks now
+                 */
+                ar->len = ext4_has_free_blocks(sbi, ar->len);
+        }
+        if (ar->len == 0) {
+                *errp = -ENOSPC;
+                return 0;
+        }
        while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        }
        inquota = ar->len;
+        if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                ar->flags |= EXT4_MB_DELALLOC_RESERVED;
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (!ac) {
+                ar->len = 0;
                *errp = -ENOMEM;
-                return 0;
+                goto out1;
        }
        ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        *errp = ext4_mb_initialize_context(ac, ar);
        if (*errp) {
                ar->len = 0;
-                goto out;
+                goto out2;
        }
        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);
 repeat:
@@ -4085,11 +4261,12 @@ repeat:
        ext4_mb_release_context(ac);
-out:
+out2:
+        kmem_cache_free(ext4_ac_cachep, ac);
+out1:
        if (ar->len < inquota)
                DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
-        kmem_cache_free(ext4_ac_cachep, ac);
        return block;
 }
 static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
                overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
        }
-        bitmap_bh = read_block_bitmap(sb, block_group);
+        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
                ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
        } else {
                ext4_lock_group(sb, block_group);
-                err = mb_free_blocks(inode, &e4b, bit, count);
+                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
                ext4_unlock_group(sb, block_group);
-                BUG_ON(err != 0);
        }
        spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
        spin_unlock(sb_bgl_lock(sbi, block_group));
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks += count;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        ext4_mb_release_desc(&e4b);
        *freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 /*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+        /* rec_len is kept in its on-disk form; convert it before use */
+        unsigned rec_len = ext4_rec_len_from_disk(p->rec_len);
+        /* the following entry starts rec_len bytes past the current one */
+        return (struct ext4_dir_entry_2 *)((char *)p + rec_len);
+}
+/*
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
 {
        unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
                EXT4_DIR_REC_LEN(2) - infosize;
-        return 0? 20: entry_space / sizeof(struct dx_entry);
+        return entry_space / sizeof(struct dx_entry);
 }
 static inline unsigned dx_node_limit (struct inode *dir)
 {
        unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
-        return 0? 22: entry_space / sizeof(struct dx_entry);
+        return entry_space / sizeof(struct dx_entry);
 }
 /*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 /*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
-        return (struct ext4_dir_entry_2 *)((char *)p +
-                ext4_rec_len_from_disk(p->rec_len));
-}
-/*
 * This function fills a red-black tree with information from a
 * directory block.  It returns the number directory entries loaded
 * into the tree.  If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
                de = (struct ext4_dir_entry_2 *) bh->b_data;
                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
                                       EXT4_DIR_REC_LEN(0));
-                for (; de < top; de = ext4_next_entry(de))
+                for (; de < top; de = ext4_next_entry(de)) {
-                if (ext4_match (namelen, name, de)) {
+                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                        if (!ext4_check_dir_entry("ext4_find_entry",
+                                  + ((char *) de - bh->b_data);
-                                                  dir, de, bh,
-                                  (block<<EXT4_BLOCK_SIZE_BITS(sb))
+                        if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
-                                          +((char *)de - bh->b_data))) {
+                                brelse(bh);
-                                brelse (bh);
                                *err = ERR_BAD_DX_DIR;
                                goto errout;
                        }
-                        *res_dir = de;
-                        dx_release (frames);
+                        if (ext4_match(namelen, name, de)) {
-                        return bh;
+                                *res_dir = de;
+                                dx_release(frames);
+                                return bh;
+                        }
                }
                brelse (bh);
                /* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
        /*
+         * We can allocate memory for mb_alloc based on the new group
+         * descriptor
+         */
+        if (test_opt(sb, MBALLOC)) {
+                err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+                if (err)
+                        goto exit_journal;
+        }
+        /*
         * Make the new blocks and inodes valid next.  We do this before
         * increasing the group count so that once the group is enabled,
         * all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        handle_t *handle;
        int err;
        unsigned long freed_blocks;
+        ext4_group_t group;
+        struct ext4_group_info *grp;
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        }
        /* Handle the remaining blocks in the last group only. */
-        ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+        ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
        if (last == 0) {
                ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                   o_blocks_count + add);
        if ((err = ext4_journal_stop(handle)))
                goto exit_put;
+        /*
+         * Mark mballoc pages as not up to date so that they will be updated
+         * next time they are loaded by ext4_mb_load_buddy.
+         */
+        if (test_opt(sb, MBALLOC)) {
+                struct ext4_sb_info *sbi = EXT4_SB(sb);
+                struct inode *inode = sbi->s_buddy_cache;
+                int blocks_per_page;
+                int block;
+                int pnum;
+                struct page *page;
+                /* Set buddy page as not up to date */
+                blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+                block = group * 2;
+                pnum = block / blocks_per_page;
+                page = find_get_page(inode->i_mapping, pnum);
+                if (page != NULL) {
+                        ClearPageUptodate(page);
+                        page_cache_release(page);
+                }
+                /* Set bitmap page as not up to date */
+                block++;
+                pnum = block / blocks_per_page;
+                page = find_get_page(inode->i_mapping, pnum);
+                if (page != NULL) {
+                        ClearPageUptodate(page);
+                        page_cache_release(page);
+                }
+                /* Get the info on the last group */
+                grp = ext4_get_group_info(sb, group);
+                /* Update free blocks in group info */
+                ext4_mb_update_group_info(grp, add);
+        }
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
                       ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf24343979..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
        jbd2_journal_destroy(sbi->s_journal);
+        sbi->s_journal = NULL;
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
        kfree(sbi->s_group_desc);
+        kfree(sbi->s_flex_groups);
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
+        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+        ei->i_reserved_data_blocks = 0;
+        ei->i_reserved_meta_blocks = 0;
+        ei->i_allocated_meta_blocks = 0;
+        ei->i_delalloc_reserved_flag = 0;
+        spin_lock_init(&(ei->i_block_reservation_lock));
        return &ei->vfs_inode;
 }
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
        EXT4_I(inode)->i_block_alloc_info = NULL;
        if (unlikely(rsv))
                kfree(rsv);
+        jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+                                       &EXT4_I(inode)->jinode);
 }
 static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        unsigned long def_mount_opts;
        struct super_block *sb = vfs->mnt_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        journal_t *journal = sbi->s_journal;
        struct ext4_super_block *es = sbi->s_es;
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",nomballoc");
        if (test_opt(sb, I_VERSION))
                seq_puts(seq, ",i_version");
+        if (!test_opt(sb, DELALLOC))
+                seq_puts(seq, ",nodelalloc");
        if (sbi->s_stripe)
                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-        Opt_mballoc, Opt_nomballoc, Opt_stripe,
+        Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 };
 static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
        {Opt_nomballoc, "nomballoc"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
+        {Opt_delalloc, "delalloc"},
+        {Opt_nodelalloc, "nodelalloc"},
        {Opt_err, NULL},
 };
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
        int qtype, qfmt;
        char *qname;
 #endif
+        ext4_fsblk_t last_block;
        if (!options)
                return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
                        clear_opt(sbi->s_mount_opt, NOBH);
                        break;
                case Opt_extents:
+                        if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                        EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+                                ext4_warning(sb, __func__,
+                                        "extents feature not enabled "
+                                        "on this filesystem, use tune2fs\n");
+                                return 0;
+                        }
                        set_opt (sbi->s_mount_opt, EXTENTS);
                        break;
                case Opt_noextents:
+                        /*
+                         * When e2fsprogs supports resizing an already
+                         * existing ext3 file system to more than 2**32
+                         * blocks, the block allocator will need to handle
+                         * growing existing block-mapped inodes so that the
+                         * blocks allocated for them fall within 2**32.
+                         */
+                        last_block = ext4_blocks_count(sbi->s_es) - 1;
+                        if (last_block  > 0xffffffffULL) {
+                                printk(KERN_ERR "EXT4-fs: Filesystem too "
+                                                "large to mount with "
+                                                "-o noextents options\n");
+                                return 0;
+                        }
                        clear_opt (sbi->s_mount_opt, EXTENTS);
                        break;
                case Opt_i_version:
                        set_opt(sbi->s_mount_opt, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
                        break;
+                case Opt_nodelalloc:
+                        clear_opt(sbi->s_mount_opt, DELALLOC);
+                        break;
                case Opt_mballoc:
                        set_opt(sbi->s_mount_opt, MBALLOC);
                        break;
@@ -1331,6 +1370,9 @@ set_qf_format:
                                return 0;
                        sbi->s_stripe = option;
                        break;
+                case Opt_delalloc:
+                        set_opt(sbi->s_mount_opt, DELALLOC);
+                        break;
                default:
                        printk (KERN_ERR
                                "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        return res;
 }
+/*
+ * ext4_fill_flex_info - build the in-memory flex group summary.
+ *
+ * Takes s_log_groups_per_flex from the on-disk superblock.  When it is
+ * zero, or too large to be used as a shift count, flex groups are left
+ * disabled (sbi->s_log_groups_per_flex = 0) and nothing is allocated.
+ * Otherwise sbi->s_flex_groups is allocated zeroed and the free inode
+ * and free block counts of every group descriptor are added to the
+ * flex group that the block group belongs to.
+ *
+ * Returns 1 on success (including the "disabled" case) and 0 on
+ * failure; after a failure sbi->s_flex_groups is NULL.
+ */
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp;
+        struct buffer_head *bh;
+        ext4_group_t flex_group_count;
+        ext4_group_t flex_group;
+        ext4_group_t i;
+        unsigned int groups_per_flex;
+        unsigned int log_groups = sbi->s_es->s_log_groups_per_flex;
+        /*
+         * The shift count comes straight from disk.  Zero means flex
+         * groups are not in use; a value that cannot be used to shift
+         * an unsigned int is handled the same way rather than relying
+         * on an undefined shift below.
+         */
+        if (log_groups == 0 || log_groups > 31) {
+                sbi->s_log_groups_per_flex = 0;
+                return 1;
+        }
+        sbi->s_log_groups_per_flex = log_groups;
+        groups_per_flex = 1U << log_groups;
+        /* round up so a trailing partial flex group gets a slot too */
+        flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+                groups_per_flex;
+        /* kcalloc() zeroes the array and refuses a count * size overflow */
+        sbi->s_flex_groups = kcalloc(flex_group_count,
+                                     sizeof(struct flex_groups), GFP_KERNEL);
+        if (sbi->s_flex_groups == NULL) {
+                printk(KERN_ERR "EXT4-fs: not enough memory\n");
+                goto failed;
+        }
+        for (i = 0; i < sbi->s_groups_count; i++) {
+                gdp = ext4_get_group_desc(sb, i, &bh);
+                /* a descriptor that cannot be looked up must not be read */
+                if (gdp == NULL) {
+                        kfree(sbi->s_flex_groups);
+                        sbi->s_flex_groups = NULL;
+                        goto failed;
+                }
+                flex_group = ext4_flex_group(sbi, i);
+                sbi->s_flex_groups[flex_group].free_inodes +=
+                        le16_to_cpu(gdp->bg_free_inodes_count);
+                sbi->s_flex_groups[flex_group].free_blocks +=
+                        le16_to_cpu(gdp->bg_free_blocks_count);
+        }
+        return 1;
+failed:
+        return 0;
+}
 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
                            struct ext4_group_desc *gdp)
 {
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 }
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
-                                __releases(kernel_sem)
+                                __releases(kernel_lock)
-                                __acquires(kernel_sem)
+                                __acquires(kernel_lock)
 {
        struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                goto out_fail;
        }
-        if (!sb_set_blocksize(sb, blocksize)) {
-                printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
-                goto out_fail;
-        }
        /*
         * The ext4 superblock will not be buffer aligned for other than 1kB
         * block sizes.  We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        /*
         * turn on extents feature by default in ext4 filesystem
-         * User -o noextents to turn it off
+         * only if feature flag already set by mkfs or tune2fs.
+         * Use -o noextents to turn it off
         */
-        set_opt(sbi->s_mount_opt, EXTENTS);
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+                set_opt(sbi->s_mount_opt, EXTENTS);
+        else
+                ext4_warning(sb, __func__,
+                        "extents feature not enabled on this filesystem, "
+                        "use tune2fs.\n");
        /*
-         * turn on mballoc feature by default in ext4 filesystem
+         * turn on mballoc code by default in ext4 filesystem
-         * User -o nomballoc to turn it off
+         * Use -o nomballoc to turn it off
         */
        set_opt(sbi->s_mount_opt, MBALLOC);
+        /*
+         * enable delayed allocation by default
+         * Use -o nodelalloc to turn it off
+         */
+        set_opt(sbi->s_mount_opt, DELALLOC);
        if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
                            NULL, 0))
                goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
                goto failed_mount2;
        }
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+                if (!ext4_fill_flex_info(sb)) {
+                        printk(KERN_ERR
+                               "EXT4-fs: unable to initialize "
+                               "flex_bg meta info!\n");
+                        goto failed_mount2;
+                }
        sbi->s_gdb_count = db_count;
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
                "writeback");
+        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+                printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+                                "requested data journaling mode\n");
+                clear_opt(sbi->s_mount_opt, DELALLOC);
+        } else if (test_opt(sb, DELALLOC))
+                printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
        ext4_ext_init(sb);
        ext4_mb_init(sb, needs_recovery);
@@ -2372,6 +2485,7 @@ cantfind_ext4:
 failed_mount4:
        jbd2_journal_destroy(sbi->s_journal);
+        sbi->s_journal = NULL;
 failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                        err = ext4_journal_dirty_metadata(handle, bh);
                else {
                        /* Always do at least ordered writes for quotas */
-                        err = ext4_journal_dirty_data(handle, bh);
+                        err = ext4_jbd2_file_inode(handle, inode);
                        mark_buffer_dirty(bh);
                }
                brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
                        /* We need to allocate a new block */
                        ext4_fsblk_t goal = ext4_group_first_block_no(sb,
                                                EXT4_I(inode)->i_block_group);
-                        ext4_fsblk_t block = ext4_new_block(handle, inode,
+                        ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
                                                        goal, &error);
                        if (error)
                                goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
 #include "ext4.h"
 #include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
 static size_t
 ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
                        const char *name, size_t name_len)
 {
-        const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
        if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
 #include "ext4.h"
 #include "xattr.h"
-#define XATTR_USER_PREFIX "user."
 static size_t
 ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
                     const char *name, size_t name_len)
 {
-        const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
        if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
 static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
 {
-        return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+        return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
 }
 static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
        loff_t cpos;
        int ret = 0;
-        lock_kernel();
+        lock_super(sb);
        cpos = filp->f_pos;
        /* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
        if (unicode)
                __putname(unicode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return ret;
 }
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047e..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
        nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
-        lock_kernel();
        fat_free(inode, nr_clusters);
-        unlock_kernel();
        fat_flush_inodes(inode->i_sb, inode, NULL);
 }
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        int error = 0;
        unsigned int ia_valid;
-        lock_kernel();
        /*
         * Expand the file. Since inode_setattr() updates ->i_size
         * before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        error = inode_setattr(inode, attr);
 out:
-        unlock_kernel();
        return error;
 }
 EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
 static void fat_clear_inode(struct inode *inode)
 {
-        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+        struct super_block *sb = inode->i_sb;
+        struct msdos_sb_info *sbi = MSDOS_SB(sb);
-        lock_kernel();
        spin_lock(&sbi->inode_hash_lock);
        fat_cache_inval_inode(inode);
        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
        spin_unlock(&sbi->inode_hash_lock);
-        unlock_kernel();
 }
 static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
 static struct inode *fat_alloc_inode(struct super_block *sb)
 {
        struct msdos_inode_info *ei;
-        ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+        ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
        return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
        if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
                return 0;
-        lock_kernel();
+        lock_super(sb);
        bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
        if (!bh) {
                printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
        if (i_pos != MSDOS_I(inode)->i_pos) {
                spin_unlock(&sbi->inode_hash_lock);
                brelse(bh);
-                unlock_kernel();
+                unlock_super(sb);
                goto retry;
        }
@@ -606,7 +605,7 @@ retry:
                err = sync_dirty_buffer(bh);
        brelse(bh);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
 static struct dentry *fat_get_parent(struct dentry *child)
 {
+        struct super_block *sb = child->d_sb;
        struct buffer_head *bh;
        struct msdos_dir_entry *de;
        loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
        struct inode *inode;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
        if (err) {
                parent = ERR_PTR(err);
                goto out;
        }
-        inode = fat_build_inode(child->d_sb, de, i_pos);
+        inode = fat_build_inode(sb, de, i_pos);
        brelse(bh);
        if (IS_ERR(inode)) {
                parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
                parent = ERR_PTR(-ENOMEM);
        }
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return parent;
 }
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        long error;
        char buf[50];
+        /*
+         * GFP_KERNEL is ok here, because while we do hold the
+         * superblock lock, memory pressure can't call back into
+         * the filesystem, since we're only just about to mount
+         * it and have no inodes etc active!
+         */
        sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
 #include <linux/fdtable.h>
 #include <linux/capability.h>
 #include <linux/dnotify.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
        if (error)
                return error;
-        lock_kernel();
        if ((arg ^ filp->f_flags) & FASYNC) {
                if (filp->f_op && filp->f_op->fasync) {
                        error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
        filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 out:
-        unlock_kernel();
        return error;
 }
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfbb..ab2f57e3fb87 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
          GFS is perfect consistency -- changes made to the filesystem on one
          machine show up immediately on all other machines in the cluster.
-          To use the GFS2 filesystem, you will need to enable one or more of
+          To use the GFS2 filesystem in a cluster, you will need to enable
-          the below locking modules. Documentation and utilities for GFS2 can
+          the locking module below. Documentation and utilities for GFS2 can
          be found here: http://sources.redhat.com/cluster
-config GFS2_FS_LOCKING_NOLOCK
+          The "nolock" lock module is now built in to GFS2 by default.
-        tristate "GFS2 \"nolock\" locking module"
-        depends on GFS2_FS
-        help
-          Single node locking module for GFS2.
-          Use this module if you want to use GFS2 on a single node without
-          its clustering features. You can still take advantage of the
-          large file support, and upgrade to running a full cluster later on
-          if required.
-          If you will only be using GFS2 in cluster mode, you do not need this
-          module.
 config GFS2_FS_LOCKING_DLM
        tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a07..ec65851ec80a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
        ops_fstype.o ops_inode.o ops_super.o quota.o \
        recovery.o rgrp.o super.o sys.o trans.o util.o
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
 obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b56..ef606e3a5cf4 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
 };
 enum {
-        NO_WAIT = 0,
-        WAIT = 1,
-};
-enum {
        NO_FORCE = 0,
        FORCE = 1,
 };
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5d..13391e546616 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
        struct hlist_head hb_list;
 };
-struct glock_iter {
+struct gfs2_glock_iter {
-        int hash;                     /* hash bucket index         */
+        int hash;                       /* hash bucket index         */
-        struct gfs2_sbd *sdp;         /* incore superblock         */
+        struct gfs2_sbd *sdp;           /* incore superblock         */
-        struct gfs2_glock *gl;        /* current glock struct      */
+        struct gfs2_glock *gl;          /* current glock struct      */
-        struct seq_file *seq;         /* sequence file for debugfs */
+        char string[512];               /* scratch space             */
-        char string[512];             /* scratch space             */
 };
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
-static void gfs2_glock_drop_th(struct gfs2_glock *gl);
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
-static void run_queue(struct gfs2_glock *gl);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
 #endif
 /**
- * relaxed_state_ok - is a requested lock compatible with the current lock mode?
- * @actual: the current state of the lock
- * @requested: the lock state that was requested by the caller
- * @flags: the modifier flags passed in by the caller
- *
- * Returns: 1 if the locks are compatible, 0 otherwise
- */
-static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
-                                   int flags)
-{
-        if (actual == requested)
-                return 1;
-        if (flags & GL_EXACT)
-                return 0;
-        if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
-                return 1;
-        if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
-                return 1;
-        return 0;
-}
-/**
 * gl_hash() - Turn glock number into hash bucket number
 * @lock: The glock number
 *
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct inode *aspace = gl->gl_aspace;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+        if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
                sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
        if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 int gfs2_glock_put(struct gfs2_glock *gl)
 {
        int rv = 0;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
        write_lock(gl_lock_addr(gl->gl_hash));
        if (atomic_dec_and_test(&gl->gl_ref)) {
                hlist_del(&gl->gl_list);
                write_unlock(gl_lock_addr(gl->gl_hash));
-                gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+                GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-                gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
-                gfs2_assert(sdp, list_empty(&gl->gl_holders));
+                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
-                gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
-                gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
                glock_free(gl);
                rv = 1;
                goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
        return gl;
 }
+/**
+ * may_grant - check if it's ok to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * Returns: true if it's ok to grant the lock
+ */
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+        const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+        if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+             gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+                return 0;
+        if (gl->gl_state == gh->gh_state)
+                return 1;
+        if (gh->gh_flags & GL_EXACT)
+                return 0;
+        if (gl->gl_state == LM_ST_EXCLUSIVE) {
+                if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+                        return 1;
+                if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+                        return 1;
+        }
+        if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+                return 1;
+        return 0;
+}
+static void gfs2_holder_wake(struct gfs2_holder *gh)
+{
+        clear_bit(HIF_WAIT, &gh->gh_iflags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ * 
+ * Returns: true if there is a blocked holder at the head of the list
+ */
+static int do_promote(struct gfs2_glock *gl)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_holder *gh, *tmp;
+        int ret;
+restart:
+        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        continue;
+                if (may_grant(gl, gh)) {
+                        if (gh->gh_list.prev == &gl->gl_holders &&
+                            glops->go_lock) {
+                                spin_unlock(&gl->gl_spin);
+                                /* FIXME: eliminate this eventually */
+                                ret = glops->go_lock(gh);
+                                spin_lock(&gl->gl_spin);
+                                if (ret) {
+                                        gh->gh_error = ret;
+                                        list_del_init(&gh->gh_list);
+                                        gfs2_holder_wake(gh);
+                                        goto restart;
+                                }
+                                set_bit(HIF_HOLDER, &gh->gh_iflags);
+                                gfs2_holder_wake(gh);
+                                goto restart;
+                        }
+                        set_bit(HIF_HOLDER, &gh->gh_iflags);
+                        gfs2_holder_wake(gh);
+                        continue;
+                }
+                if (gh->gh_list.prev == &gl->gl_holders)
+                        return 1;
+                break;
+        }
+        return 0;
+}
+/**
+ * do_error - Something unexpected has happened during a lock request
+ *
+ */
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+        struct gfs2_holder *gh, *tmp;
+        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        continue;
+                if (ret & LM_OUT_ERROR)
+                        gh->gh_error = -EIO;
+                else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+                        gh->gh_error = GLR_TRYFAILED;
+                else
+                        continue;
+                list_del_init(&gh->gh_list);
+                gfs2_holder_wake(gh);
+        }
+}
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ */
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+        struct gfs2_holder *gh;
+        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+                if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        return gh;
+        }
+        return NULL;
+}
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ */
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+        int held1, held2;
+        held1 = (gl->gl_state != LM_ST_UNLOCKED);
+        held2 = (new_state != LM_ST_UNLOCKED);
+        if (held1 != held2) {
+                if (held2)
+                        gfs2_glock_hold(gl);
+                else
+                        gfs2_glock_put(gl);
+        }
+        gl->gl_state = new_state;
+        gl->gl_tchange = jiffies;
+}
+static void gfs2_demote_wake(struct gfs2_glock *gl)
+{
+        gl->gl_demote_state = LM_ST_EXCLUSIVE;
+        clear_bit(GLF_DEMOTE, &gl->gl_flags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ */
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_holder *gh;
+        unsigned state = ret & LM_OUT_ST_MASK;
+        spin_lock(&gl->gl_spin);
+        state_change(gl, state);
+        gh = find_first_waiter(gl);
+        /* Demote to UN request arrived during demote to SH or DF */
+        if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+            state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+                gl->gl_target = LM_ST_UNLOCKED;
+        /* Check for state != intended state */
+        if (unlikely(state != gl->gl_target)) {
+                if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+                        /* move to back of queue and try next entry */
+                        if (ret & LM_OUT_CANCELED) {
+                                if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+                                        list_move_tail(&gh->gh_list, &gl->gl_holders);
+                                gh = find_first_waiter(gl);
+                                gl->gl_target = gh->gh_state;
+                                goto retry;
+                        }
+                        /* Some error or failed "try lock" - report it */
+                        if ((ret & LM_OUT_ERROR) ||
+                            (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+                                gl->gl_target = gl->gl_state;
+                                do_error(gl, ret);
+                                goto out;
+                        }
+                }
+                switch(state) {
+                /* Unlocked due to conversion deadlock, try again */
+                case LM_ST_UNLOCKED:
+retry:
+                        do_xmote(gl, gh, gl->gl_target);
+                        break;
+                /* Conversion fails, unlock and try again */
+                case LM_ST_SHARED:
+                case LM_ST_DEFERRED:
+                        do_xmote(gl, gh, LM_ST_UNLOCKED);
+                        break;
+                default: /* Everything else */
+                        printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+                        GLOCK_BUG_ON(gl, 1);
+                }
+                spin_unlock(&gl->gl_spin);
+                gfs2_glock_put(gl);
+                return;
+        }
+        /* Fast path - we got what we asked for */
+        if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+                gfs2_demote_wake(gl);
+        if (state != LM_ST_UNLOCKED) {
+                if (glops->go_xmote_bh) {
+                        int rv;
+                        spin_unlock(&gl->gl_spin);
+                        rv = glops->go_xmote_bh(gl, gh);
+                        if (rv == -EAGAIN)
+                                return;
+                        spin_lock(&gl->gl_spin);
+                        if (rv) {
+                                do_error(gl, rv);
+                                goto out;
+                        }
+                }
+                do_promote(gl);
+        }
+out:
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        spin_unlock(&gl->gl_spin);
+        gfs2_glock_put(gl);
+}
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+                                 unsigned int cur_state, unsigned int req_state,
+                                 unsigned int flags)
+{
+        int ret = LM_OUT_ERROR;
+        if (!sdp->sd_lockstruct.ls_ops->lm_lock)
+                return req_state == LM_ST_UNLOCKED ? 0 : req_state;
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+                                                         req_state, flags);
+        return ret;
+}
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The glock
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ */
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        unsigned int lck_flags = gh ? gh->gh_flags : 0;
+        int ret;
+        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+                      LM_FLAG_PRIORITY);
+        BUG_ON(gl->gl_state == target);
+        BUG_ON(gl->gl_state == gl->gl_target);
+        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+            glops->go_inval) {
+                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+                do_error(gl, 0); /* Fail queued try locks */
+        }
+        spin_unlock(&gl->gl_spin);
+        if (glops->go_xmote_th)
+                glops->go_xmote_th(gl);
+        if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+                glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+        clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+        gfs2_glock_hold(gl);
+        if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
+            gl->gl_state == LM_ST_DEFERRED) &&
+            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+                lck_flags |= LM_FLAG_TRY_1CB;
+        ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+        if (!(ret & LM_OUT_ASYNC)) {
+                finish_xmote(gl, ret);
+                gfs2_glock_hold(gl);
+                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                        gfs2_glock_put(gl);
+        } else {
+                GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
+        }
+        spin_lock(&gl->gl_spin);
+}
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+        struct gfs2_holder *gh;
+        if (!list_empty(&gl->gl_holders)) {
+                gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        return gh;
+        }
+        return NULL;
+}
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ */
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+{
+        struct gfs2_holder *gh = NULL;
+        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+                return;
+        GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+        if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+            gl->gl_demote_state != gl->gl_state) {
+                if (find_first_holder(gl))
+                        goto out;
+                if (nonblock)
+                        goto out_sched;
+                set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+                GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+                gl->gl_target = gl->gl_demote_state;
+        } else {
+                if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+                        gfs2_demote_wake(gl);
+                if (do_promote(gl) == 0)
+                        goto out;
+                gh = find_first_waiter(gl);
+                gl->gl_target = gh->gh_state;
+                if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+                        do_error(gl, 0); /* Fail queued try locks */
+        }
+        do_xmote(gl, gh, gl->gl_target);
+        return;
+out_sched:
+        gfs2_glock_hold(gl);
+        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                gfs2_glock_put(gl);
+out:
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+}
 static void glock_work_func(struct work_struct *work)
 {
+        unsigned long delay = 0;
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+                finish_xmote(gl, gl->gl_reply);
        spin_lock(&gl->gl_spin);
-        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
+        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
-                set_bit(GLF_DEMOTE, &gl->gl_flags);
+            gl->gl_state != LM_ST_UNLOCKED &&
-        run_queue(gl);
+            gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+                unsigned long holdtime, now = jiffies;
+                holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+                if (time_before(now, holdtime))
+                        delay = holdtime - now;
+                set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
+        }
+        run_queue(gl, 0);
        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
+        if (!delay ||
+            queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+                gfs2_glock_put(gl);
 }
 static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
                     void **lockp)
 {
        int error = -EIO;
+        if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
+                return 0;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
                                sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_name = name;
        atomic_set(&gl->gl_ref, 1);
        gl->gl_state = LM_ST_UNLOCKED;
+        gl->gl_target = LM_ST_UNLOCKED;
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
        gl->gl_hash = hash;
-        gl->gl_owner_pid = NULL;
-        gl->gl_ip = 0;
        gl->gl_ops = glops;
-        gl->gl_req_gh = NULL;
        gl->gl_stamp = jiffies;
        gl->gl_tchange = jiffies;
        gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
        gh->gh_ip = 0;
 }
-static void gfs2_holder_wake(struct gfs2_holder *gh)
-{
-        clear_bit(HIF_WAIT, &gh->gh_iflags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-}
 static int just_schedule(void *word)
 {
        schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
        wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
 }
-static void gfs2_demote_wake(struct gfs2_glock *gl)
-{
-        gl->gl_demote_state = LM_ST_EXCLUSIVE;
-        clear_bit(GLF_DEMOTE, &gl->gl_flags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
-}
 static void wait_on_demote(struct gfs2_glock *gl)
 {
        might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
 }
 /**
- * rq_mutex - process a mutex request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_mutex(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        list_del_init(&gh->gh_list);
-        /*  gh->gh_error never examined.  */
-        set_bit(GLF_LOCK, &gl->gl_flags);
-        clear_bit(HIF_WAIT, &gh->gh_iflags);
-        smp_mb();
-        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-        return 1;
-}
-/**
- * rq_promote - process a promote request in the queue
- * @gh: the glock holder
- *
- * Acquire a new inter-node lock, or change a lock state to more restrictive.
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_promote(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-                if (list_empty(&gl->gl_holders)) {
-                        gl->gl_req_gh = gh;
-                        set_bit(GLF_LOCK, &gl->gl_flags);
-                        spin_unlock(&gl->gl_spin);
-                        gfs2_glock_xmote_th(gh->gh_gl, gh);
-                        spin_lock(&gl->gl_spin);
-                }
-                return 1;
-        }
-        if (list_empty(&gl->gl_holders)) {
-                set_bit(HIF_FIRST, &gh->gh_iflags);
-                set_bit(GLF_LOCK, &gl->gl_flags);
-        } else {
-                struct gfs2_holder *next_gh;
-                if (gh->gh_state == LM_ST_EXCLUSIVE)
-                        return 1;
-                next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
-                                     gh_list);
-                if (next_gh->gh_state == LM_ST_EXCLUSIVE)
-                         return 1;
-        }
-        list_move_tail(&gh->gh_list, &gl->gl_holders);
-        gh->gh_error = 0;
-        set_bit(HIF_HOLDER, &gh->gh_iflags);
-        gfs2_holder_wake(gh);
-        return 0;
-}
-/**
- * rq_demote - process a demote request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_demote(struct gfs2_glock *gl)
-{
-        if (!list_empty(&gl->gl_holders))
-                return 1;
-        if (gl->gl_state == gl->gl_demote_state ||
-            gl->gl_state == LM_ST_UNLOCKED) {
-                gfs2_demote_wake(gl);
-                return 0;
-        }
-        set_bit(GLF_LOCK, &gl->gl_flags);
-        set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-        if (gl->gl_demote_state == LM_ST_UNLOCKED ||
-            gl->gl_state != LM_ST_EXCLUSIVE) {
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_drop_th(gl);
-        } else {
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_xmote_th(gl, NULL);
-        }
-        spin_lock(&gl->gl_spin);
-        clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-        return 0;
-}
-/**
- * run_queue - process holder structures on a glock
- * @gl: the glock
- *
- */
-static void run_queue(struct gfs2_glock *gl)
-{
-        struct gfs2_holder *gh;
-        int blocked = 1;
-        for (;;) {
-                if (test_bit(GLF_LOCK, &gl->gl_flags))
-                        break;
-                if (!list_empty(&gl->gl_waiters1)) {
-                        gh = list_entry(gl->gl_waiters1.next,
-                                        struct gfs2_holder, gh_list);
-                        blocked = rq_mutex(gh);
-                } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
-                        blocked = rq_demote(gl);
-                        if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
-                                     !blocked) {
-                                set_bit(GLF_DEMOTE, &gl->gl_flags);
-                                gl->gl_demote_state = LM_ST_UNLOCKED;
-                        }
-                        clear_bit(GLF_WAITERS2, &gl->gl_flags);
-                } else if (!list_empty(&gl->gl_waiters3)) {
-                        gh = list_entry(gl->gl_waiters3.next,
-                                        struct gfs2_holder, gh_list);
-                        blocked = rq_promote(gh);
-                } else
-                        break;
-                if (blocked)
-                        break;
-        }
-}
-/**
- * gfs2_glmutex_lock - acquire a local lock on a glock
- * @gl: the glock
- *
- * Gives caller exclusive access to manipulate a glock structure.
- */
-static void gfs2_glmutex_lock(struct gfs2_glock *gl)
-{
-        spin_lock(&gl->gl_spin);
-        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
-                struct gfs2_holder gh;
-                gfs2_holder_init(gl, 0, 0, &gh);
-                set_bit(HIF_WAIT, &gh.gh_iflags);
-                list_add_tail(&gh.gh_list, &gl->gl_waiters1);
-                spin_unlock(&gl->gl_spin);
-                wait_on_holder(&gh);
-                gfs2_holder_uninit(&gh);
-        } else {
-                gl->gl_owner_pid = get_pid(task_pid(current));
-                gl->gl_ip = (unsigned long)__builtin_return_address(0);
-                spin_unlock(&gl->gl_spin);
-        }
-}
-/**
- * gfs2_glmutex_trylock - try to acquire a local lock on a glock
- * @gl: the glock
- *
- * Returns: 1 if the glock is acquired
- */
-static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
-{
-        int acquired = 1;
-        spin_lock(&gl->gl_spin);
-        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
-                acquired = 0;
-        } else {
-                gl->gl_owner_pid = get_pid(task_pid(current));
-                gl->gl_ip = (unsigned long)__builtin_return_address(0);
-        }
-        spin_unlock(&gl->gl_spin);
-        return acquired;
-}
-/**
- * gfs2_glmutex_unlock - release a local lock on a glock
- * @gl: the glock
- *
- */
-static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
-{
-        struct pid *pid;
-        spin_lock(&gl->gl_spin);
-        clear_bit(GLF_LOCK, &gl->gl_flags);
-        pid = gl->gl_owner_pid;
-        gl->gl_owner_pid = NULL;
-        gl->gl_ip = 0;
-        run_queue(gl);
-        spin_unlock(&gl->gl_spin);
-        put_pid(pid);
-}
-/**
 * handle_callback - process a demote request
 * @gl: the glock
 * @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 {
        int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
-        spin_lock(&gl->gl_spin);
        set_bit(bit, &gl->gl_flags);
        if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
                gl->gl_demote_state = state;
                gl->gl_demote_time = jiffies;
                if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
-                    gl->gl_object) {
+                    gl->gl_object)
                        gfs2_glock_schedule_for_reclaim(gl);
-                        spin_unlock(&gl->gl_spin);
-                        return;
-                }
        } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
                        gl->gl_demote_state != state) {
-                if (test_bit(GLF_DEMOTE_IN_PROGRESS,  &gl->gl_flags)) 
+                gl->gl_demote_state = LM_ST_UNLOCKED;
-                        set_bit(GLF_WAITERS2, &gl->gl_flags);
-                else 
-                        gl->gl_demote_state = LM_ST_UNLOCKED;
-        }
-        spin_unlock(&gl->gl_spin);
-}
-/**
- * state_change - record that the glock is now in a different state
- * @gl: the glock
- * @new_state the new state
- *
- */
-static void state_change(struct gfs2_glock *gl, unsigned int new_state)
-{
-        int held1, held2;
-        held1 = (gl->gl_state != LM_ST_UNLOCKED);
-        held2 = (new_state != LM_ST_UNLOCKED);
-        if (held1 != held2) {
-                if (held2)
-                        gfs2_glock_hold(gl);
-                else
-                        gfs2_glock_put(gl);
        }
-        gl->gl_state = new_state;
-        gl->gl_tchange = jiffies;
 }
 /**
- * drop_bh - Called after a lock module unlock completes
+ * gfs2_glock_wait - wait on a glock acquisition
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_holder *gh = gl->gl_req_gh;
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, !ret);
-        state_change(gl, LM_ST_UNLOCKED);
-        if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
-                spin_lock(&gl->gl_spin);
-                gh->gh_error = 0;
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_xmote_th(gl, gl->gl_req_gh);
-                gfs2_glock_put(gl);
-                return;
-        }
-        spin_lock(&gl->gl_spin);
-        gfs2_demote_wake(gl);
-        clear_bit(GLF_LOCK, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
-}
-/**
- * xmote_bh - Called after the lock module is done acquiring a lock
- * @gl: The glock in question
- * @ret: the int returned from the lock module
- *
- */
-static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        struct gfs2_holder *gh = gl->gl_req_gh;
-        int op_done = 1;
-        if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
-                drop_bh(gl, ret);
-                return;
-        }
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
-        state_change(gl, ret & LM_OUT_ST_MASK);
-        /*  Deal with each possible exit condition  */
-        if (!gh) {
-                gl->gl_stamp = jiffies;
-                if (ret & LM_OUT_CANCELED) {
-                        op_done = 0;
-                } else {
-                        spin_lock(&gl->gl_spin);
-                        if (gl->gl_state != gl->gl_demote_state) {
-                                spin_unlock(&gl->gl_spin);
-                                gfs2_glock_drop_th(gl);
-                                gfs2_glock_put(gl);
-                                return;
-                        }
-                        gfs2_demote_wake(gl);
-                        spin_unlock(&gl->gl_spin);
-                }
-        } else {
-                spin_lock(&gl->gl_spin);
-                if (ret & LM_OUT_CONV_DEADLK) {
-                        gh->gh_error = 0;
-                        set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
-                        spin_unlock(&gl->gl_spin);
-                        gfs2_glock_drop_th(gl);
-                        gfs2_glock_put(gl);
-                        return;
-                }
-                list_del_init(&gh->gh_list);
-                gh->gh_error = -EIO;
-                if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
-                        goto out;
-                gh->gh_error = GLR_CANCELED;
-                if (ret & LM_OUT_CANCELED) 
-                        goto out;
-                if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-                        list_add_tail(&gh->gh_list, &gl->gl_holders);
-                        gh->gh_error = 0;
-                        set_bit(HIF_HOLDER, &gh->gh_iflags);
-                        set_bit(HIF_FIRST, &gh->gh_iflags);
-                        op_done = 0;
-                        goto out;
-                }
-                gh->gh_error = GLR_TRYFAILED;
-                if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
-                        goto out;
-                gh->gh_error = -EINVAL;
-                if (gfs2_assert_withdraw(sdp, 0) == -1)
-                        fs_err(sdp, "ret = 0x%.8X\n", ret);
-out:
-                spin_unlock(&gl->gl_spin);
-        }
-        if (glops->go_xmote_bh)
-                glops->go_xmote_bh(gl);
-        if (op_done) {
-                spin_lock(&gl->gl_spin);
-                gl->gl_req_gh = NULL;
-                clear_bit(GLF_LOCK, &gl->gl_flags);
-                spin_unlock(&gl->gl_spin);
-        }
-        gfs2_glock_put(gl);
-        if (gh)
-                gfs2_holder_wake(gh);
-}
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                 unsigned int cur_state, unsigned int req_state,
-                                 unsigned int flags)
-{
-        int ret = 0;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
-                                                         req_state, flags);
-        return ret;
-}
-/**
- * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
- * @gl: The glock in question
- * @state: the requested state
- * @flags: modifier flags to the lock call
- *
- */
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        int flags = gh ? gh->gh_flags : 0;
-        unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
-                                 LM_FLAG_NOEXP | LM_FLAG_ANY |
-                                 LM_FLAG_PRIORITY);
-        unsigned int lck_ret;
-        if (glops->go_xmote_th)
-                glops->go_xmote_th(gl);
-        if (state == LM_ST_DEFERRED && glops->go_inval)
-                glops->go_inval(gl, DIO_METADATA);
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
-        gfs2_assert_warn(sdp, state != gl->gl_state);
-        gfs2_glock_hold(gl);
-        lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
-        if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
-                return;
-        if (lck_ret & LM_OUT_ASYNC)
-                gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
-        else
-                xmote_bh(gl, lck_ret);
-}
-static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
-                                   unsigned int cur_state)
-{
-        int ret = 0;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
-        return ret;
-}
-/**
- * gfs2_glock_drop_th - call into the lock module to unlock a lock
- * @gl: the glock
- *
- */
-static void gfs2_glock_drop_th(struct gfs2_glock *gl)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        unsigned int ret;
-        if (glops->go_xmote_th)
-                glops->go_xmote_th(gl);
-        if (glops->go_inval)
-                glops->go_inval(gl, DIO_METADATA);
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-        gfs2_glock_hold(gl);
-        ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
-        if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
-                return;
-        if (!ret)
-                drop_bh(gl, ret);
-        else
-                gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
-}
-/**
- * do_cancels - cancel requests for locks stuck waiting on an expire flag
- * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
- *
- * Don't cancel GL_NOCANCEL requests.
- */
-static void do_cancels(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        spin_lock(&gl->gl_spin);
-        while (gl->gl_req_gh != gh &&
-               !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
-               !list_empty(&gh->gh_list)) {
-                if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
-                        spin_unlock(&gl->gl_spin);
-                        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                                sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
-                        msleep(100);
-                        spin_lock(&gl->gl_spin);
-                } else {
-                        spin_unlock(&gl->gl_spin);
-                        msleep(100);
-                        spin_lock(&gl->gl_spin);
-                }
-        }
-        spin_unlock(&gl->gl_spin);
-}
-/**
- * glock_wait_internal - wait on a glock acquisition
 * @gh: the glock holder
 *
 * Returns: 0 on success
 */
-static int glock_wait_internal(struct gfs2_holder *gh)
+int gfs2_glock_wait(struct gfs2_holder *gh)
 {
-        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        if (test_bit(HIF_ABORTED, &gh->gh_iflags))
-                return -EIO;
-        if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-                spin_lock(&gl->gl_spin);
-                if (gl->gl_req_gh != gh &&
-                    !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
-                    !list_empty(&gh->gh_list)) {
-                        list_del_init(&gh->gh_list);
-                        gh->gh_error = GLR_TRYFAILED;
-                        run_queue(gl);
-                        spin_unlock(&gl->gl_spin);
-                        return gh->gh_error;
-                }
-                spin_unlock(&gl->gl_spin);
-        }
-        if (gh->gh_flags & LM_FLAG_PRIORITY)
-                do_cancels(gh);
        wait_on_holder(gh);
-        if (gh->gh_error)
-                return gh->gh_error;
-        gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
-        gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
-                                                   gh->gh_flags));
-        if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
-                gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-                if (glops->go_lock) {
-                        gh->gh_error = glops->go_lock(gh);
-                        if (gh->gh_error) {
-                                spin_lock(&gl->gl_spin);
-                                list_del_init(&gh->gh_list);
-                                spin_unlock(&gl->gl_spin);
-                        }
-                }
-                spin_lock(&gl->gl_spin);
-                gl->gl_req_gh = NULL;
-                clear_bit(GLF_LOCK, &gl->gl_flags);
-                run_queue(gl);
-                spin_unlock(&gl->gl_spin);
-        }
        return gh->gh_error;
 }
-static inline struct gfs2_holder *
+/**
+ * gfs2_print_dbg - print a debug message to a seq_file or the kernel log
+ * @seq: the seq_file to write to, or NULL to use printk instead
+ * @fmt: printf-style format string, followed by its arguments
+ *
+ * With a seq_file, the message is first formatted into the string buffer
+ * of the struct gfs2_glock_iter found in seq->private and then written to
+ * the seq_file.  Without one, it is sent to the kernel log at KERN_ERR.
+ *
+ * NOTE(review): vsprintf() into gi->string is unbounded; confirm that the
+ * buffer is large enough for every caller's message.
+ */
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
-find_holder_by_owner(struct list_head *head, struct pid *pid)
-{
-        struct gfs2_holder *gh;
-        list_for_each_entry(gh, head, gh_list) {
-                if (gh->gh_owner_pid == pid)
-                        return gh;
-        }
-        return NULL;
-}
-static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
 {
         va_list args;
         va_start(args, fmt);
-        if (gi) {
+        if (seq) {
+                struct gfs2_glock_iter *gi = seq->private;
                 vsprintf(gi->string, fmt, args);
-                seq_printf(gi->seq, gi->string);
+                /* gi->string is already formatted: never reuse it as a format */
+                seq_printf(seq, "%s", gi->string);
-        }
+        } else {
-        else
+                printk(KERN_ERR " ");
                 vprintk(fmt, args);
+        }
         va_end(args);
 }
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
 * add_to_queue - Add a holder to the wait queue (but look for recursion)
 * @gh: the holder structure to add
 *
+ * Eventually the recursive locking trap should be moved behind a
+ * debugging option. This is the fast path and needs to have the
+ * minimum number of distractions.
+ *
 */
-static void add_to_queue(struct gfs2_holder *gh)
+static inline void add_to_queue(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_holder *existing;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct list_head *insert_pt = NULL;
+        struct gfs2_holder *gh2;
+        int try_lock = 0;
        BUG_ON(gh->gh_owner_pid == NULL);
        if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
                BUG();
-        if (!(gh->gh_flags & GL_FLOCK)) {
+        if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-                existing = find_holder_by_owner(&gl->gl_holders, 
+                if (test_bit(GLF_LOCK, &gl->gl_flags))
-                                                gh->gh_owner_pid);
+                        try_lock = 1;
-                if (existing) {
+                if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
-                        print_symbol(KERN_WARNING "original: %s\n", 
+                        goto fail;
-                                     existing->gh_ip);
+        }
-                        printk(KERN_INFO "pid : %d\n",
-                                        pid_nr(existing->gh_owner_pid));
+        list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
-                        printk(KERN_INFO "lock type : %d lock state : %d\n",
+                if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
-                               existing->gh_gl->gl_name.ln_type, 
+                    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
-                               existing->gh_gl->gl_state);
+                        goto trap_recursive;
-                        print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+                if (try_lock &&
-                        printk(KERN_INFO "pid : %d\n",
+                    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
-                                        pid_nr(gh->gh_owner_pid));
+                    !may_grant(gl, gh)) {
-                        printk(KERN_INFO "lock type : %d lock state : %d\n",
+fail:
-                               gl->gl_name.ln_type, gl->gl_state);
+                        gh->gh_error = GLR_TRYFAILED;
-                        BUG();
+                        gfs2_holder_wake(gh);
-                }
+                        return;
-                
-                existing = find_holder_by_owner(&gl->gl_waiters3, 
-                                                gh->gh_owner_pid);
-                if (existing) {
-                        print_symbol(KERN_WARNING "original: %s\n", 
-                                     existing->gh_ip);
-                        print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-                        BUG();
                }
+                if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+                        continue;
+                if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+                        insert_pt = &gh2->gh_list;
+        }
+        if (likely(insert_pt == NULL)) {
+                list_add_tail(&gh->gh_list, &gl->gl_holders);
+                if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+                        goto do_cancel;
+                return;
+        }
+        list_add_tail(&gh->gh_list, insert_pt);
+do_cancel:
+        gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+        if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+                spin_unlock(&gl->gl_spin);
+                if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+                        sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+                spin_lock(&gl->gl_spin);
        }
+        return;
-        if (gh->gh_flags & LM_FLAG_PRIORITY)
+trap_recursive:
-                list_add(&gh->gh_list, &gl->gl_waiters3);
+        print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
-        else
+        printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
-                list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+        printk(KERN_ERR "lock type: %d req lock state : %d\n",
+               gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+        print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+        printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
+        printk(KERN_ERR "lock type: %d req lock state : %d\n",
+               gh->gh_gl->gl_name.ln_type, gh->gh_state);
+        __dump_glock(NULL, gl);
+        BUG();
 }
 /**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        int error = 0;
-restart:
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
-                set_bit(HIF_ABORTED, &gh->gh_iflags);
                return -EIO;
-        }
        spin_lock(&gl->gl_spin);
        add_to_queue(gh);
-        run_queue(gl);
+        run_queue(gl, 1);
        spin_unlock(&gl->gl_spin);
-        if (!(gh->gh_flags & GL_ASYNC)) {
+        if (!(gh->gh_flags & GL_ASYNC))
-                error = glock_wait_internal(gh);
+                error = gfs2_glock_wait(gh);
-                if (error == GLR_CANCELED) {
-                        msleep(100);
-                        goto restart;
-                }
-        }
        return error;
 }
@@ -1196,48 +980,7 @@ restart:
 int gfs2_glock_poll(struct gfs2_holder *gh)
 {
-        struct gfs2_glock *gl = gh->gh_gl;
+        return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
-        int ready = 0;
-        spin_lock(&gl->gl_spin);
-        if (test_bit(HIF_HOLDER, &gh->gh_iflags))
-                ready = 1;
-        else if (list_empty(&gh->gh_list)) {
-                if (gh->gh_error == GLR_CANCELED) {
-                        spin_unlock(&gl->gl_spin);
-                        msleep(100);
-                        if (gfs2_glock_nq(gh))
-                                return 1;
-                        return 0;
-                } else
-                        ready = 1;
-        }
-        spin_unlock(&gl->gl_spin);
-        return ready;
-}
-/**
- * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
- * @gh: the holder structure
- *
- * Returns: 0, GLR_TRYFAILED, or errno on failure
- */
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
-        int error;
-        error = glock_wait_internal(gh);
-        if (error == GLR_CANCELED) {
-                msleep(100);
-                gh->gh_flags &= ~GL_ASYNC;
-                error = gfs2_glock_nq(gh);
-        }
-        return error;
 }
 /**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
        struct gfs2_glock *gl = gh->gh_gl;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        unsigned delay = 0;
+        int fast_path = 0;
+        spin_lock(&gl->gl_spin);
        if (gh->gh_flags & GL_NOCACHE)
                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-        gfs2_glmutex_lock(gl);
-        spin_lock(&gl->gl_spin);
        list_del_init(&gh->gh_list);
+        if (find_first_holder(gl) == NULL) {
-        if (list_empty(&gl->gl_holders)) {
                if (glops->go_unlock) {
+                        GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
                        spin_unlock(&gl->gl_spin);
                        glops->go_unlock(gh);
                        spin_lock(&gl->gl_spin);
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
                }
                gl->gl_stamp = jiffies;
+                if (list_empty(&gl->gl_holders) &&
+                    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+                    !test_bit(GLF_DEMOTE, &gl->gl_flags))
+                        fast_path = 1;
        }
-        clear_bit(GLF_LOCK, &gl->gl_flags);
        spin_unlock(&gl->gl_spin);
+        if (likely(fast_path))
+                return;
        gfs2_glock_hold(gl);
        if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
 {
        int error = -EIO;
+        if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
+                return 0;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
        return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
 {
        int error;
-        gfs2_glmutex_lock(gl);
        if (!atomic_read(&gl->gl_lvb_count)) {
                error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
-                if (error) {
+                if (error) 
-                        gfs2_glmutex_unlock(gl);
                        return error;
-                }
                gfs2_glock_hold(gl);
        }
        atomic_inc(&gl->gl_lvb_count);
-        gfs2_glmutex_unlock(gl);
        return 0;
 }
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        gfs2_glock_hold(gl);
-        gfs2_glmutex_lock(gl);
        gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
        if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-                if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
                        sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
                gl->gl_lvb = NULL;
                gfs2_glock_put(gl);
        }
-        gfs2_glmutex_unlock(gl);
        gfs2_glock_put(gl);
 }
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
        if (time_before(now, holdtime))
                delay = holdtime - now;
+        spin_lock(&gl->gl_spin);
        handle_callback(gl, state, 1, delay);
+        spin_unlock(&gl->gl_spin);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
 }
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
                gl = gfs2_glock_find(sdp, &async->lc_name);
                if (gfs2_assert_warn(sdp, gl))
                        return;
-                xmote_bh(gl, async->lc_ret);
+                gl->gl_reply = async->lc_ret;
+                set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
                up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
                        wake_up_process(sdp->sd_recoverd_process);
                return;
-        case LM_CB_DROPLOCKS:
-                gfs2_gl_hash_clear(sdp, NO_WAIT);
-                gfs2_quota_scan(sdp);
-                return;
        default:
                gfs2_assert_warn(sdp, 0);
                return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
 {
        struct gfs2_glock *gl;
+        int done_callback = 0;
        spin_lock(&sdp->sd_reclaim_lock);
        if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
        atomic_dec(&sdp->sd_reclaim_count);
        atomic_inc(&sdp->sd_reclaimed);
-        if (gfs2_glmutex_trylock(gl)) {
+        spin_lock(&gl->gl_spin);
-                if (list_empty(&gl->gl_holders) &&
+        if (find_first_holder(gl) == NULL &&
-                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+            gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
-                        handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-                gfs2_glmutex_unlock(gl);
+                done_callback = 1;
        }
+        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
+        if (!done_callback ||
+            queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                gfs2_glock_put(gl);
 }
 /**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
 {
        if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
                return;
+        if (test_bit(GLF_LOCK, &gl->gl_flags))
+                return;
-        if (gfs2_glmutex_trylock(gl)) {
+        spin_lock(&gl->gl_spin);
-                if (list_empty(&gl->gl_holders) &&
+        if (find_first_holder(gl) == NULL &&
-                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+            gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
-                        goto out_schedule;
+                gfs2_glock_schedule_for_reclaim(gl);
-                gfs2_glmutex_unlock(gl);
+        spin_unlock(&gl->gl_spin);
-        }
-        return;
-out_schedule:
-        gfs2_glmutex_unlock(gl);
-        gfs2_glock_schedule_for_reclaim(gl);
 }
 /**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
                spin_unlock(&sdp->sd_reclaim_lock);
        }
-        if (gfs2_glmutex_trylock(gl)) {
+        spin_lock(&gl->gl_spin);
-                if (list_empty(&gl->gl_holders) &&
+        if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
-                    gl->gl_state != LM_ST_UNLOCKED)
+                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-                        handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+        spin_unlock(&gl->gl_spin);
-                gfs2_glmutex_unlock(gl);
+        gfs2_glock_hold(gl);
-        }
+        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                gfs2_glock_put(gl);
 }
 /**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
 * @sdp: the filesystem
 * @wait: wait until it's all gone
 *
- * Called when unmounting the filesystem, or when inter-node lock manager
+ * Called when unmounting the filesystem.
- * requests DROPLOCKS because it is running out of capacity.
 */
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
        unsigned long t;
        unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
                                cont = 1;
                }
-                if (!wait || !cont)
+                if (!cont)
                        break;
                if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
        }
 }
-/*
+static const char *state2str(unsigned state)
- *  Diagnostic routines to help debug distributed deadlock
- */
-static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
-                              unsigned long address)
 {
-        char buffer[KSYM_SYMBOL_LEN];
+        switch(state) {
+        case LM_ST_UNLOCKED:
-        sprint_symbol(buffer, address);
+                return "UN";
-        print_dbg(gi, fmt, buffer);
+        case LM_ST_SHARED:
+                return "SH";
+        case LM_ST_DEFERRED:
+                return "DF";
+        case LM_ST_EXCLUSIVE:
+                return "EX";
+        }
+        return "??";
+}
+/**
+ * hflags2str - render a holder's flags as a compact string of letters
+ * @buf: output buffer; needs room for one letter per flag tested below
+ *       (twelve) plus the terminating NUL
+ * @flags: the holder's gh_flags (LM_FLAG_... and GL_... bits)
+ * @iflags: the holder's gh_iflags (HIF_... bit numbers)
+ *
+ * Every flag that is set appends exactly one letter, in the fixed order
+ * tested below.  Each flag has its own letter so that the "f:" field of a
+ * holder line can be decoded unambiguously.
+ *
+ * Returns: @buf
+ */
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+        char *p = buf;
+        if (flags & LM_FLAG_TRY)
+                *p++ = 't';
+        if (flags & LM_FLAG_TRY_1CB)
+                *p++ = 'T';
+        if (flags & LM_FLAG_NOEXP)
+                *p++ = 'e';
+        if (flags & LM_FLAG_ANY)
+                *p++ = 'A';     /* upper case: 'a' is taken by GL_ASYNC */
+        if (flags & LM_FLAG_PRIORITY)
+                *p++ = 'p';
+        if (flags & GL_ASYNC)
+                *p++ = 'a';
+        if (flags & GL_EXACT)
+                *p++ = 'E';
+        if (flags & GL_ATIME)
+                *p++ = 'm';     /* atiMe: 'a' and 'A' are already in use */
+        if (flags & GL_NOCACHE)
+                *p++ = 'c';
+        if (test_bit(HIF_HOLDER, &iflags))
+                *p++ = 'H';
+        if (test_bit(HIF_WAIT, &iflags))
+                *p++ = 'W';
+        if (test_bit(HIF_FIRST, &iflags))
+                *p++ = 'F';
+        *p = 0;
+        return buf;
 }
 /**
 * dump_holder - print information about a glock holder
- * @str: a string naming the type of holder
+ * @seq: the seq_file struct
 * @gh: the glock holder
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */
-static int dump_holder(struct glock_iter *gi, char *str,
+static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
-                       struct gfs2_holder *gh)
 {
-        unsigned int x;
+        struct task_struct *gh_owner = NULL;
-        struct task_struct *gh_owner;
+        char buffer[KSYM_SYMBOL_LEN];
+        char flags_buf[32];
-        print_dbg(gi, "  %s\n", str);
+        sprint_symbol(buffer, gh->gh_ip);
-        if (gh->gh_owner_pid) {
+        if (gh->gh_owner_pid)
-                print_dbg(gi, "    owner = %ld ",
-                                (long)pid_nr(gh->gh_owner_pid));
                gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-                if (gh_owner)
+        gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
-                        print_dbg(gi, "(%s)\n", gh_owner->comm);
+                  state2str(gh->gh_state),
-                else
+                  hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                        print_dbg(gi, "(ended)\n");
+                  gh->gh_error, 
-        } else
+                  gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                print_dbg(gi, "    owner = -1\n");
+                  gh_owner ? gh_owner->comm : "(ended)", buffer);
-        print_dbg(gi, "    gh_state = %u\n", gh->gh_state);
-        print_dbg(gi, "    gh_flags =");
-        for (x = 0; x < 32; x++)
-                if (gh->gh_flags & (1 << x))
-                        print_dbg(gi, " %u", x);
-        print_dbg(gi, " \n");
-        print_dbg(gi, "    error = %d\n", gh->gh_error);
-        print_dbg(gi, "    gh_iflags =");
-        for (x = 0; x < 32; x++)
-                if (test_bit(x, &gh->gh_iflags))
-                        print_dbg(gi, " %u", x);
-        print_dbg(gi, " \n");
-        gfs2_print_symbol(gi, "    initialized at: %s\n", gh->gh_ip);
        return 0;
 }
-/**
+static const char *gflags2str(char *buf, const unsigned long *gflags)
- * dump_inode - print information about an inode
+{
- * @ip: the inode
+        char *p = buf;
- *
+        if (test_bit(GLF_LOCK, gflags))
- * Returns: 0 on success, -ENOBUFS when we run out of space
+                *p++ = 'l';
- */
+        if (test_bit(GLF_STICKY, gflags))
+                *p++ = 's';
-static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
+        if (test_bit(GLF_DEMOTE, gflags))
-{
+                *p++ = 'D';
-        unsigned int x;
+        if (test_bit(GLF_PENDING_DEMOTE, gflags))
+                *p++ = 'd';
-        print_dbg(gi, "  Inode:\n");
+        if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
-        print_dbg(gi, "    num = %llu/%llu\n",
+                *p++ = 'p';
-                  (unsigned long long)ip->i_no_formal_ino,
+        if (test_bit(GLF_DIRTY, gflags))
-                  (unsigned long long)ip->i_no_addr);
+                *p++ = 'y';
-        print_dbg(gi, "    type = %u\n", IF2DT(ip->i_inode.i_mode));
+        if (test_bit(GLF_LFLUSH, gflags))
-        print_dbg(gi, "    i_flags =");
+                *p++ = 'f';
-        for (x = 0; x < 32; x++)
+        if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
-                if (test_bit(x, &ip->i_flags))
+                *p++ = 'i';
-                        print_dbg(gi, " %u", x);
+        if (test_bit(GLF_REPLY_PENDING, gflags))
-        print_dbg(gi, " \n");
+                *p++ = 'r';
-        return 0;
+        *p = 0;
+        return buf;
 }
 /**
- * dump_glock - print information about a glock
+ * __dump_glock - print information about a glock
+ * @seq: The seq_file struct
 * @gl: the glock
- * @count: where we are in the buffer
+ *
+ * The file format is as follows:
+ * One line per object, capital letters are used to indicate objects
+ * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
+ * other objects are indented by a single space and follow the glock to
+ * which they are related. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings which are in
+ * [] so that it's possible to see if they are composed of spaces for
+ * example. The fields are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 {
-        struct gfs2_holder *gh;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        unsigned int x;
+        unsigned long long dtime;
-        int error = -ENOBUFS;
+        const struct gfs2_holder *gh;
-        struct task_struct *gl_owner;
+        char gflags_buf[32];
+        int error = 0;
-        spin_lock(&gl->gl_spin);
+        dtime = jiffies - gl->gl_demote_time;
+        dtime *= 1000000/HZ; /* demote time in uSec */
+        if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+                dtime = 0;
+        gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+                  state2str(gl->gl_state),
+                  gl->gl_name.ln_type,
+                  (unsigned long long)gl->gl_name.ln_number,
+                  gflags2str(gflags_buf, &gl->gl_flags),
+                  state2str(gl->gl_target),
+                  state2str(gl->gl_demote_state), dtime,
+                  atomic_read(&gl->gl_lvb_count),
+                  atomic_read(&gl->gl_ail_count),
+                  atomic_read(&gl->gl_ref));
-        print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
-                   (unsigned long long)gl->gl_name.ln_number);
-        print_dbg(gi, "  gl_flags =");
-        for (x = 0; x < 32; x++) {
-                if (test_bit(x, &gl->gl_flags))
-                        print_dbg(gi, " %u", x);
-        }
-        if (!test_bit(GLF_LOCK, &gl->gl_flags))
-                print_dbg(gi, " (unlocked)");
-        print_dbg(gi, " \n");
-        print_dbg(gi, "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
-        print_dbg(gi, "  gl_state = %u\n", gl->gl_state);
-        if (gl->gl_owner_pid) {
-                gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
-                if (gl_owner)
-                        print_dbg(gi, "  gl_owner = pid %d (%s)\n",
-                                  pid_nr(gl->gl_owner_pid), gl_owner->comm);
-                else
-                        print_dbg(gi, "  gl_owner = %d (ended)\n",
-                                  pid_nr(gl->gl_owner_pid));
-        } else
-                print_dbg(gi, "  gl_owner = -1\n");
-        print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
-        print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-        print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
-        print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-        print_dbg(gi, "  reclaim = %s\n",
-                   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
-        if (gl->gl_aspace)
-                print_dbg(gi, "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
-                           gl->gl_aspace->i_mapping->nrpages);
-        else
-                print_dbg(gi, "  aspace = no\n");
-        print_dbg(gi, "  ail = %d\n", atomic_read(&gl->gl_ail_count));
-        if (gl->gl_req_gh) {
-                error = dump_holder(gi, "Request", gl->gl_req_gh);
-                if (error)
-                        goto out;
-        }
        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-                error = dump_holder(gi, "Holder", gh);
+                error = dump_holder(seq, gh);
                if (error)
                        goto out;
        }
-        list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+        if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
-                error = dump_holder(gi, "Waiter1", gh);
+                error = glops->go_dump(seq, gl);
-                if (error)
-                        goto out;
-        }
-        list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
-                error = dump_holder(gi, "Waiter3", gh);
-                if (error)
-                        goto out;
-        }
-        if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
-                print_dbg(gi, "  Demotion req to state %u (%llu uS ago)\n",
-                          gl->gl_demote_state, (unsigned long long)
-                          (jiffies - gl->gl_demote_time)*(1000000/HZ));
-        }
-        if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
-                if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
-                        list_empty(&gl->gl_holders)) {
-                        error = dump_inode(gi, gl->gl_object);
-                        if (error)
-                                goto out;
-                } else {
-                        error = -ENOBUFS;
-                        print_dbg(gi, "  Inode: busy\n");
-                }
-        }
-        error = 0;
 out:
-        spin_unlock(&gl->gl_spin);
        return error;
 }
+/* Locked wrapper: runs __dump_glock() on @gl with gl->gl_spin held. */
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+        int error;
+        spin_lock(&gl->gl_spin);
+        error = __dump_glock(seq, gl);  /* result is handed back unchanged */
+        spin_unlock(&gl->gl_spin);
+        return error;
+}
 /**
 * gfs2_dump_lockstate - print out the current lockstate
 * @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
 module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-static int gfs2_glock_iter_next(struct glock_iter *gi)
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
        struct gfs2_glock *gl;
@@ -2104,7 +1824,7 @@ restart:
                gfs2_glock_put(gl);
        if (gl && gi->gl == NULL)
                gi->hash++;
-        while(gi->gl == NULL) {
+        while (gi->gl == NULL) {
                if (gi->hash >= GFS2_GL_HASH_SIZE)
                        return 1;
                read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
        return 0;
 }
-static void gfs2_glock_iter_free(struct glock_iter *gi)
+static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
 {
        if (gi->gl)
                gfs2_glock_put(gi->gl);
-        kfree(gi);
-}
-static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
-{
-        struct glock_iter *gi;
-        gi = kmalloc(sizeof (*gi), GFP_KERNEL);
-        if (!gi)
-                return NULL;
-        gi->sdp = sdp;
-        gi->hash = 0;
-        gi->seq = NULL;
        gi->gl = NULL;
-        memset(gi->string, 0, sizeof(gi->string));
-        if (gfs2_glock_iter_next(gi)) {
-                gfs2_glock_iter_free(gi);
-                return NULL;
-        }
-        return gi;
 }
-static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 {
-        struct glock_iter *gi;
+        struct gfs2_glock_iter *gi = seq->private;
        loff_t n = *pos;
-        gi = gfs2_glock_iter_init(file->private);
+        gi->hash = 0;
-        if (!gi)
-                return NULL;
-        while(n--) {
+        do {
                if (gfs2_glock_iter_next(gi)) {
                        gfs2_glock_iter_free(gi);
                        return NULL;
                }
-        }
+        } while (n--);
-        return gi;
+        return gi->gl;
 }
-static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
                                 loff_t *pos)
 {
-        struct glock_iter *gi = iter_ptr;
+        struct gfs2_glock_iter *gi = seq->private;
        (*pos)++;
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
                return NULL;
        }
-        return gi;
+        return gi->gl;
 }
-static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
-        struct glock_iter *gi = iter_ptr;
+        struct gfs2_glock_iter *gi = seq->private;
-        if (gi)
+        gfs2_glock_iter_free(gi);
-                gfs2_glock_iter_free(gi);
 }
-static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-        struct glock_iter *gi = iter_ptr;
+        return dump_glock(seq, iter_ptr);
-        gi->seq = file;
-        dump_glock(gi, gi->gl);
-        return 0;
 }
 static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
 static int gfs2_debugfs_open(struct inode *inode, struct file *file)
 {
-        struct seq_file *seq;
+        int ret = seq_open_private(file, &gfs2_glock_seq_ops,
-        int ret;
+                                   sizeof(struct gfs2_glock_iter));
+        if (ret == 0) {
-        ret = seq_open(file, &gfs2_glock_seq_ops);
+                struct seq_file *seq = file->private_data;
-        if (ret)
+                struct gfs2_glock_iter *gi = seq->private;
-                return ret;
+                gi->sdp = inode->i_private;
+        }
-        seq = file->private_data;
+        return ret;
-        seq->private = inode->i_private;
-        return 0;
 }
 static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
        .open    = gfs2_debugfs_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-        .release = seq_release
+        .release = seq_release_private,
 };
 int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f8150..971d92af70fc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
 #define GL_SKIP                 0x00000100
 #define GL_ATIME                0x00000200
 #define GL_NOCACHE              0x00000400
-#define GL_FLOCK                0x00000800
-#define GL_NOCANCEL             0x00001000
 #define GLR_TRYFAILED           13
-#define GLR_CANCELED            14
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
        spin_lock(&gl->gl_spin);
        pid = task_pid(current);
        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+                if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        break;
                if (gh->gh_owner_pid == pid)
                        goto out;
        }
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 {
        int ret;
        spin_lock(&gl->gl_spin);
-        ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
+        ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
        spin_unlock(&gl->gl_spin);
        return ret;
 }
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 /**
 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
 void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda4..c6c318c2a0f6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/bio.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
 }
 /**
- * inode_go_xmote_bh - After promoting/demoting a glock
- * @gl: the glock
- *
- */
-static void inode_go_xmote_bh(struct gfs2_glock *gl)
-{
-        struct gfs2_holder *gh = gl->gl_req_gh;
-        struct buffer_head *bh;
-        int error;
-        if (gl->gl_state != LM_ST_UNLOCKED &&
-            (!gh || !(gh->gh_flags & GL_SKIP))) {
-                error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
-                if (!error)
-                        brelse(bh);
-        }
-}
-/**
 * inode_go_inval - prepare a inode glock to be released
 * @gl: the glock
 * @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
 }
 /**
+ * inode_go_dump - print information about an inode
+ * @seq: The iterator
+ * @gl: the glock whose attached inode (gl_object) is printed
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+        const struct gfs2_inode *inode_obj = gl->gl_object;
+        /* A glock with no inode attached contributes no " I:" line. */
+        if (inode_obj != NULL) {
+                unsigned long long formal_ino = inode_obj->i_no_formal_ino;
+                unsigned long long addr = inode_obj->i_no_addr;
+                gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+                          formal_ino, addr,
+                          IF2DT(inode_obj->i_inode.i_mode),
+                          inode_obj->i_flags);
+        }
+        return 0;
+}
+/**
 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
 * @gl: the glock
 *
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 }
 /**
+ * rgrp_go_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+        const struct gfs2_rgrpd *rgrp = gl->gl_object;
+        /* Only a glock with a resource group attached gets an " R:" line. */
+        if (rgrp != NULL) {
+                unsigned long long addr = rgrp->rd_addr;
+                gfs2_print_dbg(seq, " R: n:%llu\n", addr);
+        }
+        return 0;
+}
+/**
 * trans_go_sync - promote/demote the transaction glock
 * @gl: the glock
 * @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
 *
 */
-static void trans_go_xmote_bh(struct gfs2_glock *gl)
+static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
        struct gfs2_log_header_host head;
        int error;
-        if (gl->gl_state != LM_ST_UNLOCKED &&
+        if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
                j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
                error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
                        gfs2_log_pointers_init(sdp, head.lh_blkno);
                }
        }
+        return 0;
 }
 /**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
 const struct gfs2_glock_operations gfs2_inode_glops = {
        .go_xmote_th = inode_go_sync,
-        .go_xmote_bh = inode_go_xmote_bh,
        .go_inval = inode_go_inval,
        .go_demote_ok = inode_go_demote_ok,
        .go_lock = inode_go_lock,
+        .go_dump = inode_go_dump,
        .go_type = LM_TYPE_INODE,
-        .go_min_hold_time = HZ / 10,
+        .go_min_hold_time = HZ / 5,
 };
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
        .go_demote_ok = rgrp_go_demote_ok,
        .go_lock = rgrp_go_lock,
        .go_unlock = rgrp_go_unlock,
+        .go_dump = rgrp_go_dump,
        .go_type = LM_TYPE_RGRP,
-        .go_min_hold_time = HZ / 10,
+        .go_min_hold_time = HZ / 5,
 };
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41da..448697a5c462 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
 struct gfs2_rgrpd {
        struct list_head rd_list;       /* Link with superblock */
        struct list_head rd_list_mru;
-        struct list_head rd_recent;     /* Recently used rgrps */
        struct gfs2_glock *rd_gl;       /* Glock for this rgrp */
        u64 rd_addr;                    /* grp block disk address */
        u64 rd_data0;                   /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
 struct gfs2_glock_operations {
        void (*go_xmote_th) (struct gfs2_glock *gl);
-        void (*go_xmote_bh) (struct gfs2_glock *gl);
+        int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
        void (*go_inval) (struct gfs2_glock *gl, int flags);
        int (*go_demote_ok) (struct gfs2_glock *gl);
        int (*go_lock) (struct gfs2_holder *gh);
        void (*go_unlock) (struct gfs2_holder *gh);
+        int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
        const int go_type;
        const unsigned long go_min_hold_time;
 };
 enum {
        /* States */
-        HIF_HOLDER              = 6,
+        HIF_HOLDER              = 6,  /* Set for gh that "holds" the glock */
        HIF_FIRST               = 7,
-        HIF_ABORTED             = 9,
        HIF_WAIT                = 10,
 };
@@ -154,20 +153,20 @@ struct gfs2_holder {
        unsigned gh_flags;
        int gh_error;
-        unsigned long gh_iflags;
+        unsigned long gh_iflags; /* HIF_... */
        unsigned long gh_ip;
 };
 enum {
-        GLF_LOCK                = 1,
+        GLF_LOCK                        = 1,
-        GLF_STICKY              = 2,
+        GLF_STICKY                      = 2,
-        GLF_DEMOTE              = 3,
+        GLF_DEMOTE                      = 3,
-        GLF_PENDING_DEMOTE      = 4,
+        GLF_PENDING_DEMOTE              = 4,
-        GLF_DIRTY               = 5,
+        GLF_DEMOTE_IN_PROGRESS          = 5,
-        GLF_DEMOTE_IN_PROGRESS  = 6,
+        GLF_DIRTY                       = 6,
-        GLF_LFLUSH              = 7,
+        GLF_LFLUSH                      = 7,
-        GLF_WAITERS2            = 8,
+        GLF_INVALIDATE_IN_PROGRESS      = 8,
-        GLF_CONV_DEADLK         = 9,
+        GLF_REPLY_PENDING               = 9,
 };
 struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
        spinlock_t gl_spin;
        unsigned int gl_state;
+        unsigned int gl_target;
+        unsigned int gl_reply;
        unsigned int gl_hash;
        unsigned int gl_demote_state; /* state requested by remote node */
        unsigned long gl_demote_time; /* time of first demote request */
-        struct pid *gl_owner_pid;
-        unsigned long gl_ip;
        struct list_head gl_holders;
-        struct list_head gl_waiters1;   /* HIF_MUTEX */
-        struct list_head gl_waiters3;   /* HIF_PROMOTE */
        const struct gfs2_glock_operations *gl_ops;
-        struct gfs2_holder *gl_req_gh;
        void *gl_lock;
        char *gl_lvb;
        atomic_t gl_lvb_count;
@@ -427,7 +421,6 @@ struct gfs2_tune {
        unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
        unsigned int gt_atime_quantum; /* Min secs between atime updates */
        unsigned int gt_new_files_jdata;
-        unsigned int gt_new_files_directio;
        unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
        unsigned int gt_stall_secs; /* Detects trouble! */
        unsigned int gt_complain_secs;
@@ -534,7 +527,6 @@ struct gfs2_sbd {
        struct mutex sd_rindex_mutex;
        struct list_head sd_rindex_list;
        struct list_head sd_rindex_mru_list;
-        struct list_head sd_rindex_recent_list;
        struct gfs2_rgrpd *sd_rindex_forward;
        unsigned int sd_rgrps;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e41..6da0ab355b8a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
        }
        if (!is_root) {
-                error = permission(dir, MAY_EXEC, NULL);
+                error = gfs2_permission(dir, MAY_EXEC);
                if (error)
                        goto out;
        }
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
        int error;
-        error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
                    gfs2_tune_get(sdp, gt_new_files_jdata))
                        di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
-                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
-                    gfs2_tune_get(sdp, gt_new_files_directio))
-                        di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
        } else if (S_ISDIR(mode)) {
                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
-                                            GFS2_DIF_INHERIT_DIRECTIO);
-                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
                                            GFS2_DIF_INHERIT_JDATA);
        }
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
        if (IS_APPEND(&dip->i_inode))
                return -EPERM;
-        error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38f..6074c2506f75 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 }
-void gfs2_inode_attr_in(struct gfs2_inode *ip);
 void gfs2_set_iop(struct inode *inode);
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
                                u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
                struct gfs2_inode *ip);
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
                   const struct gfs2_inode *ip);
+int gfs2_permission(struct inode *inode, int mask);
 int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
 int gfs2_glock_nq_atime(struct gfs2_holder *gh);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee728783..523243a13a21 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
        const struct lm_lockops *lw_ops;
 };
+static int nolock_mount(char *table_name, char *host_data,
+                        lm_callback_t cb, void *cb_data,
+                        unsigned int min_lvb_size, int flags,
+                        struct lm_lockstruct *lockstruct,
+                        struct kobject *fskobj);
 /* List of registered low-level locking protocols.  A file system selects one
   of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+static const struct lm_lockops nolock_ops = {
+        .lm_proto_name = "lock_nolock",
+        .lm_mount = nolock_mount,
+};
+static struct lmh_wrapper nolock_proto  = {
+        .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
+        .lw_ops = &nolock_ops,
+};
 static LIST_HEAD(lmh_list);
 static DEFINE_MUTEX(lmh_lock);
+/**
+ * nolock_mount - mount callback of the built-in "lock_nolock" protocol
+ * @table_name: unused
+ * @host_data: host data string; may carry the journal id as "jid=<number>"
+ * @cb: unused
+ * @cb_data: unused
+ * @min_lvb_size: copied to @lockstruct->ls_lvb_size
+ * @flags: unused
+ * @lockstruct: filled in with the result of the mount
+ * @fskobj: unused
+ *
+ * The journal id defaults to 0 when @host_data contains no "jid=", or when
+ * "jid=" is not followed by a number.
+ *
+ * Returns: 0 (always)
+ */
+static int nolock_mount(char *table_name, char *host_data,
+                        lm_callback_t cb, void *cb_data,
+                        unsigned int min_lvb_size, int flags,
+                        struct lm_lockstruct *lockstruct,
+                        struct kobject *fskobj)
+{
+        char *c;
+        unsigned int jid = 0;
+        c = strstr(host_data, "jid=");
+        /*
+         * sscanf() leaves jid untouched when nothing numeric follows "jid=";
+         * make sure ls_jid never receives an indeterminate value.
+         */
+        if (c && sscanf(c + 4, "%u", &jid) != 1)
+                jid = 0;
+        lockstruct->ls_jid = jid;
+        lockstruct->ls_first = 1;
+        lockstruct->ls_lvb_size = min_lvb_size;
+        lockstruct->ls_ops = &nolock_ops;
+        lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+        return 0;
+}
 /**
 * gfs2_register_lockproto - Register a low-level locking protocol
 * @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
        int try = 0;
        int error, found;
 retry:
        mutex_lock(&lmh_lock);
+        if (list_empty(&nolock_proto.lw_list))
+                list_add(&nolock_proto.lw_list, &lmh_list);
        found = 0;
        list_for_each_entry(lw, &lmh_list, lw_list) {
                if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
                goto out;
        }
-        if (!try_module_get(lw->lw_ops->lm_owner)) {
+        if (lw->lw_ops->lm_owner &&
+            !try_module_get(lw->lw_ops->lm_owner)) {
                try = 0;
                mutex_unlock(&lmh_lock);
                msleep(1000);
@@ -158,7 +205,8 @@ out:
 void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
 {
        mutex_lock(&lmh_lock);
-        lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+        if (lockstruct->ls_ops->lm_unmount)
+                lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
        if (lockstruct->ls_ops->lm_owner)
                module_put(lockstruct->ls_ops->lm_owner);
        mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec87..2482c9047505 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
 static char junk_lvb[GDLM_LVB_SIZE];
-static void queue_complete(struct gdlm_lock *lp)
+/* convert dlm lock-mode to gfs lock-state */
+static s16 gdlm_make_lmstate(s16 dlmmode)
 {
-        struct gdlm_ls *ls = lp->ls;
+        switch (dlmmode) {
+        case DLM_LOCK_IV:
+        case DLM_LOCK_NL:
+                return LM_ST_UNLOCKED;
+        case DLM_LOCK_EX:
+                return LM_ST_EXCLUSIVE;
+        case DLM_LOCK_CW:
+                return LM_ST_DEFERRED;
+        case DLM_LOCK_PR:
+                return LM_ST_SHARED;
+        }
+        gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+        return -1;
+}
-        clear_bit(LFL_ACTIVE, &lp->flags);
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+   thread gets to it. */
+static void queue_submit(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
        spin_lock(&ls->async_lock);
-        list_add_tail(&lp->clist, &ls->complete);
+        list_add_tail(&lp->delay_list, &ls->submit);
        spin_unlock(&ls->async_lock);
        wake_up(&ls->thread_wait);
 }
-static inline void gdlm_ast(void *astarg)
+static void wake_up_ast(struct gdlm_lock *lp)
 {
-        queue_complete(astarg);
+        clear_bit(LFL_AST_WAIT, &lp->flags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&lp->flags, LFL_AST_WAIT);
 }
-static inline void gdlm_bast(void *astarg, int mode)
+static void gdlm_delete_lp(struct gdlm_lock *lp)
 {
-        struct gdlm_lock *lp = astarg;
        struct gdlm_ls *ls = lp->ls;
-        if (!mode) {
-                printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
-                        lp->lockname.ln_type,
-                        (unsigned long long)lp->lockname.ln_number);
-                return;
-        }
        spin_lock(&ls->async_lock);
-        if (!lp->bast_mode) {
+        if (!list_empty(&lp->delay_list))
-                list_add_tail(&lp->blist, &ls->blocking);
+                list_del_init(&lp->delay_list);
-                lp->bast_mode = mode;
+        ls->all_locks_count--;
-        } else if (lp->bast_mode < mode)
-                lp->bast_mode = mode;
        spin_unlock(&ls->async_lock);
-        wake_up(&ls->thread_wait);
+        kfree(lp);
 }
-void gdlm_queue_delayed(struct gdlm_lock *lp)
+static void gdlm_queue_delayed(struct gdlm_lock *lp)
 {
        struct gdlm_ls *ls = lp->ls;
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
        spin_unlock(&ls->async_lock);
 }
+static void process_complete(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
+        struct lm_async_cb acb;
+        memset(&acb, 0, sizeof(acb));
+        if (lp->lksb.sb_status == -DLM_ECANCEL) {
+                log_info("complete dlm cancel %x,%llx flags %lx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number,
+                         lp->flags);
+                lp->req = lp->cur;
+                acb.lc_ret |= LM_OUT_CANCELED;
+                if (lp->cur == DLM_LOCK_IV)
+                        lp->lksb.sb_lkid = 0;
+                goto out;
+        }
+        if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+                if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+                        log_info("unlock sb_status %d %x,%llx flags %lx",
+                                 lp->lksb.sb_status, lp->lockname.ln_type,
+                                 (unsigned long long)lp->lockname.ln_number,
+                                 lp->flags);
+                        return;
+                }
+                lp->cur = DLM_LOCK_IV;
+                lp->req = DLM_LOCK_IV;
+                lp->lksb.sb_lkid = 0;
+                if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+                        gdlm_delete_lp(lp);
+                        return;
+                }
+                goto out;
+        }
+        if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+                memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+        if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+                if (lp->req == DLM_LOCK_PR)
+                        lp->req = DLM_LOCK_CW;
+                else if (lp->req == DLM_LOCK_CW)
+                        lp->req = DLM_LOCK_PR;
+        }
+        /*
+         * A canceled lock request.  The lock was just taken off the delayed
+         * list and was never even submitted to dlm.
+         */
+        if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+                log_info("complete internal cancel %x,%llx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number);
+                lp->req = lp->cur;
+                acb.lc_ret |= LM_OUT_CANCELED;
+                goto out;
+        }
+        /*
+         * An error occurred.
+         */
+        if (lp->lksb.sb_status) {
+                /* a "normal" error */
+                if ((lp->lksb.sb_status == -EAGAIN) &&
+                    (lp->lkf & DLM_LKF_NOQUEUE)) {
+                        lp->req = lp->cur;
+                        if (lp->cur == DLM_LOCK_IV)
+                                lp->lksb.sb_lkid = 0;
+                        goto out;
+                }
+                /* this could only happen with cancels I think */
+                log_info("ast sb_status %d %x,%llx flags %lx",
+                         lp->lksb.sb_status, lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number,
+                         lp->flags);
+                return;
+        }
+        /*
+         * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+         */
+        if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+                wake_up_ast(lp);
+                return;
+        }
+        /*
+         * A lock has been demoted to NL because it initially completed during
+         * BLOCK_LOCKS.  Now it must be requested in the originally requested
+         * mode.
+         */
+        if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+                gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+                            lp->lockname.ln_type,
+                            (unsigned long long)lp->lockname.ln_number);
+                gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+                            lp->lockname.ln_type,
+                            (unsigned long long)lp->lockname.ln_number);
+                lp->cur = DLM_LOCK_NL;
+                lp->req = lp->prev_req;
+                lp->prev_req = DLM_LOCK_IV;
+                lp->lkf &= ~DLM_LKF_CONVDEADLK;
+                set_bit(LFL_NOCACHE, &lp->flags);
+                if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+                    !test_bit(LFL_NOBLOCK, &lp->flags))
+                        gdlm_queue_delayed(lp);
+                else
+                        queue_submit(lp);
+                return;
+        }
+        /*
+         * A request is granted during dlm recovery.  It may be granted
+         * because the locks of a failed node were cleared.  In that case,
+         * there may be inconsistent data beneath this lock and we must wait
+         * for recovery to complete to use it.  When gfs recovery is done this
+         * granted lock will be converted to NL and then reacquired in this
+         * granted state.
+         */
+        if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+            !test_bit(LFL_NOBLOCK, &lp->flags) &&
+            lp->req != DLM_LOCK_NL) {
+                lp->cur = lp->req;
+                lp->prev_req = lp->req;
+                lp->req = DLM_LOCK_NL;
+                lp->lkf |= DLM_LKF_CONVERT;
+                lp->lkf &= ~DLM_LKF_CONVDEADLK;
+                log_debug("rereq %x,%llx id %x %d,%d",
+                          lp->lockname.ln_type,
+                          (unsigned long long)lp->lockname.ln_number,
+                          lp->lksb.sb_lkid, lp->cur, lp->req);
+                set_bit(LFL_REREQUEST, &lp->flags);
+                queue_submit(lp);
+                return;
+        }
+        /*
+         * DLM demoted the lock to NL before it was granted so GFS must be
+         * told it cannot cache data for this lock.
+         */
+        if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+                set_bit(LFL_NOCACHE, &lp->flags);
+out:
+        /*
+         * This is an internal lock_dlm lock
+         */
+        if (test_bit(LFL_INLOCK, &lp->flags)) {
+                clear_bit(LFL_NOBLOCK, &lp->flags);
+                lp->cur = lp->req;
+                wake_up_ast(lp);
+                return;
+        }
+        /*
+         * Normal completion of a lock request.  Tell GFS it now has the lock.
+         */
+        clear_bit(LFL_NOBLOCK, &lp->flags);
+        lp->cur = lp->req;
+        acb.lc_name = lp->lockname;
+        acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+        ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+static void gdlm_ast(void *astarg)
+{
+        struct gdlm_lock *lp = astarg;
+        clear_bit(LFL_ACTIVE, &lp->flags);
+        process_complete(lp);
+}
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+        struct gdlm_ls *ls = lp->ls;
+        unsigned int cb = 0;
+        switch (gdlm_make_lmstate(bast_mode)) {
+        case LM_ST_EXCLUSIVE:
+                cb = LM_CB_NEED_E;
+                break;
+        case LM_ST_DEFERRED:
+                cb = LM_CB_NEED_D;
+                break;
+        case LM_ST_SHARED:
+                cb = LM_CB_NEED_S;
+                break;
+        default:
+                gdlm_assert(0, "unknown bast mode %u", bast_mode);
+        }
+        ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+static void gdlm_bast(void *astarg, int mode)
+{
+        struct gdlm_lock *lp = astarg;
+        if (!mode) {
+                printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+                        lp->lockname.ln_type,
+                        (unsigned long long)lp->lockname.ln_number);
+                return;
+        }
+        process_blocking(lp, mode);
+}
 /* convert gfs lock-state to dlm lock-mode */
 static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
        return -1;
 }
-/* convert dlm lock-mode to gfs lock-state */
-s16 gdlm_make_lmstate(s16 dlmmode)
-{
-        switch (dlmmode) {
-        case DLM_LOCK_IV:
-        case DLM_LOCK_NL:
-                return LM_ST_UNLOCKED;
-        case DLM_LOCK_EX:
-                return LM_ST_EXCLUSIVE;
-        case DLM_LOCK_CW:
-                return LM_ST_DEFERRED;
-        case DLM_LOCK_PR:
-                return LM_ST_SHARED;
-        }
-        gdlm_assert(0, "unknown DLM mode %d", dlmmode);
-        return -1;
-}
 /* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
        if (lp->lksb.sb_lkid != 0) {
                lkf |= DLM_LKF_CONVERT;
-                /* Conversion deadlock avoidance by DLM */
-                if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
-                    !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
-                    !(lkf & DLM_LKF_NOQUEUE) &&
-                    cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
-                        lkf |= DLM_LKF_CONVDEADLK;
        }
        if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
        make_strname(name, &lp->strname);
        lp->ls = ls;
        lp->cur = DLM_LOCK_IV;
-        lp->lvb = NULL;
-        lp->hold_null = NULL;
-        INIT_LIST_HEAD(&lp->clist);
-        INIT_LIST_HEAD(&lp->blist);
        INIT_LIST_HEAD(&lp->delay_list);
        spin_lock(&ls->async_lock);
-        list_add(&lp->all_list, &ls->all_locks);
        ls->all_locks_count++;
        spin_unlock(&ls->async_lock);
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
        return 0;
 }
-void gdlm_delete_lp(struct gdlm_lock *lp)
-{
-        struct gdlm_ls *ls = lp->ls;
-        spin_lock(&ls->async_lock);
-        if (!list_empty(&lp->clist))
-                list_del_init(&lp->clist);
-        if (!list_empty(&lp->blist))
-                list_del_init(&lp->blist);
-        if (!list_empty(&lp->delay_list))
-                list_del_init(&lp->delay_list);
-        gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
-                    (unsigned long long)lp->lockname.ln_number);
-        list_del_init(&lp->all_list);
-        ls->all_locks_count--;
-        spin_unlock(&ls->async_lock);
-        kfree(lp);
-}
 int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
                  void **lockp)
 {
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
        if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
                lp->lksb.sb_status = -EAGAIN;
-                queue_complete(lp);
+                gdlm_ast(lp);
                error = 0;
        }
@@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
 {
        struct gdlm_lock *lp = lock;
+        if (req_state == LM_ST_UNLOCKED)
+                return gdlm_unlock(lock, cur_state);
+        if (req_state == LM_ST_UNLOCKED)
+                return gdlm_unlock(lock, cur_state);
        clear_bit(LFL_DLM_CANCEL, &lp->flags);
        if (flags & LM_FLAG_NOEXP)
                set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
        if (delay_list) {
                set_bit(LFL_CANCEL, &lp->flags);
                set_bit(LFL_ACTIVE, &lp->flags);
-                queue_complete(lp);
+                gdlm_ast(lp);
                return;
        }
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
        wake_up(&ls->thread_wait);
 }
-int gdlm_release_all_locks(struct gdlm_ls *ls)
-{
-        struct gdlm_lock *lp, *safe;
-        int count = 0;
-        spin_lock(&ls->async_lock);
-        list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
-                list_del_init(&lp->all_list);
-                if (lp->lvb && lp->lvb != junk_lvb)
-                        kfree(lp->lvb);
-                kfree(lp);
-                count++;
-        }
-        spin_unlock(&ls->async_lock);
-        return count;
-}
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54e..3c98e7c6f93b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
        int                     recover_jid_done;
        int                     recover_jid_status;
        spinlock_t              async_lock;
-        struct list_head        complete;
-        struct list_head        blocking;
        struct list_head        delayed;
        struct list_head        submit;
-        struct list_head        all_locks;
        u32             all_locks_count;
        wait_queue_head_t       wait_control;
-        struct task_struct      *thread1;
+        struct task_struct      *thread;
-        struct task_struct      *thread2;
        wait_queue_head_t       thread_wait;
-        unsigned long           drop_time;
-        int                     drop_locks_count;
-        int                     drop_locks_period;
 };
 enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
        u32                     lkf;            /* dlm flags DLM_LKF_ */
        unsigned long           flags;          /* lock_dlm flags LFL_ */
-        int                     bast_mode;      /* protected by async_lock */
-        struct list_head        clist;          /* complete */
-        struct list_head        blist;          /* blocking */
        struct list_head        delay_list;     /* delayed */
-        struct list_head        all_list;       /* all locks for the fs */
        struct gdlm_lock        *hold_null;     /* NL lock for hold_lvb */
 };
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
 /* lock.c */
-s16 gdlm_make_lmstate(s16);
-void gdlm_queue_delayed(struct gdlm_lock *);
 void gdlm_submit_delayed(struct gdlm_ls *);
-int gdlm_release_all_locks(struct gdlm_ls *);
-void gdlm_delete_lp(struct gdlm_lock *);
 unsigned int gdlm_do_lock(struct gdlm_lock *);
 int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b50..09d78c216f48 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
        if (!ls)
                return NULL;
-        ls->drop_locks_count = GDLM_DROP_COUNT;
-        ls->drop_locks_period = GDLM_DROP_PERIOD;
        ls->fscb = cb;
        ls->sdp = sdp;
        ls->fsflags = flags;
        spin_lock_init(&ls->async_lock);
-        INIT_LIST_HEAD(&ls->complete);
-        INIT_LIST_HEAD(&ls->blocking);
        INIT_LIST_HEAD(&ls->delayed);
        INIT_LIST_HEAD(&ls->submit);
-        INIT_LIST_HEAD(&ls->all_locks);
        init_waitqueue_head(&ls->thread_wait);
        init_waitqueue_head(&ls->wait_control);
-        ls->thread1 = NULL;
-        ls->thread2 = NULL;
-        ls->drop_time = jiffies;
        ls->jid = -1;
        strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
 static void gdlm_unmount(void *lockspace)
 {
        struct gdlm_ls *ls = lockspace;
-        int rv;
        log_debug("unmount flags %lx", ls->flags);
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
        gdlm_kobject_release(ls);
        dlm_release_lockspace(ls->dlm_lockspace, 2);
        gdlm_release_threads(ls);
-        rv = gdlm_release_all_locks(ls);
+        BUG_ON(ls->all_locks_count);
-        if (rv)
-                log_info("gdlm_unmount: %d stray locks freed", rv);
 out:
        kfree(ls);
 }
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
        dlm_release_lockspace(ls->dlm_lockspace, 2);
        gdlm_release_threads(ls);
-        gdlm_release_all_locks(ls);
        gdlm_kobject_release(ls);
 }
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9ee..4ec571c3d8a9 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
        return sprintf(buf, "%d\n", ls->recover_jid_status);
 }
-static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
-{
-        return sprintf(buf, "%d\n", ls->drop_locks_count);
-}
-static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-        ls->drop_locks_count = simple_strtol(buf, NULL, 0);
-        return len;
-}
 struct gdlm_attr {
        struct attribute attr;
        ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
 GDLM_ATTR(recover,        0644, recover_show,        recover_store);
 GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
 GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-GDLM_ATTR(drop_count,     0644, drop_count_show,     drop_count_store);
 static struct attribute *gdlm_attrs[] = {
        &gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
        &gdlm_attr_recover.attr,
        &gdlm_attr_recover_done.attr,
        &gdlm_attr_recover_status.attr,
-        &gdlm_attr_drop_count.attr,
        NULL,
 };
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28ab..38823efd698c 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
 #include "lock_dlm.h"
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+static inline int no_work(struct gdlm_ls *ls)
-   thread gets to it. */
-static void queue_submit(struct gdlm_lock *lp)
-{
-        struct gdlm_ls *ls = lp->ls;
-        spin_lock(&ls->async_lock);
-        list_add_tail(&lp->delay_list, &ls->submit);
-        spin_unlock(&ls->async_lock);
-        wake_up(&ls->thread_wait);
-}
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
-        struct gdlm_ls *ls = lp->ls;
-        unsigned int cb = 0;
-        switch (gdlm_make_lmstate(bast_mode)) {
-        case LM_ST_EXCLUSIVE:
-                cb = LM_CB_NEED_E;
-                break;
-        case LM_ST_DEFERRED:
-                cb = LM_CB_NEED_D;
-                break;
-        case LM_ST_SHARED:
-                cb = LM_CB_NEED_S;
-                break;
-        default:
-                gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
-        }
-        ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-static void wake_up_ast(struct gdlm_lock *lp)
-{
-        clear_bit(LFL_AST_WAIT, &lp->flags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-static void process_complete(struct gdlm_lock *lp)
-{
-        struct gdlm_ls *ls = lp->ls;
-        struct lm_async_cb acb;
-        s16 prev_mode = lp->cur;
-        memset(&acb, 0, sizeof(acb));
-        if (lp->lksb.sb_status == -DLM_ECANCEL) {
-                log_info("complete dlm cancel %x,%llx flags %lx",
-                         lp->lockname.ln_type,
-                         (unsigned long long)lp->lockname.ln_number,
-                         lp->flags);
-                lp->req = lp->cur;
-                acb.lc_ret |= LM_OUT_CANCELED;
-                if (lp->cur == DLM_LOCK_IV)
-                        lp->lksb.sb_lkid = 0;
-                goto out;
-        }
-        if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
-                if (lp->lksb.sb_status != -DLM_EUNLOCK) {
-                        log_info("unlock sb_status %d %x,%llx flags %lx",
-                                 lp->lksb.sb_status, lp->lockname.ln_type,
-                                 (unsigned long long)lp->lockname.ln_number,
-                                 lp->flags);
-                        return;
-                }
-                lp->cur = DLM_LOCK_IV;
-                lp->req = DLM_LOCK_IV;
-                lp->lksb.sb_lkid = 0;
-                if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
-                        gdlm_delete_lp(lp);
-                        return;
-                }
-                goto out;
-        }
-        if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
-                memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-        if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
-                if (lp->req == DLM_LOCK_PR)
-                        lp->req = DLM_LOCK_CW;
-                else if (lp->req == DLM_LOCK_CW)
-                        lp->req = DLM_LOCK_PR;
-        }
-        /*
-         * A canceled lock request.  The lock was just taken off the delayed
-         * list and was never even submitted to dlm.
-         */
-        if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
-                log_info("complete internal cancel %x,%llx",
-                         lp->lockname.ln_type,
-                         (unsigned long long)lp->lockname.ln_number);
-                lp->req = lp->cur;
-                acb.lc_ret |= LM_OUT_CANCELED;
-                goto out;
-        }
-        /*
-         * An error occured.
-         */
-        if (lp->lksb.sb_status) {
-                /* a "normal" error */
-                if ((lp->lksb.sb_status == -EAGAIN) &&
-                    (lp->lkf & DLM_LKF_NOQUEUE)) {
-                        lp->req = lp->cur;
-                        if (lp->cur == DLM_LOCK_IV)
-                                lp->lksb.sb_lkid = 0;
-                        goto out;
-                }
-                /* this could only happen with cancels I think */
-                log_info("ast sb_status %d %x,%llx flags %lx",
-                         lp->lksb.sb_status, lp->lockname.ln_type,
-                         (unsigned long long)lp->lockname.ln_number,
-                         lp->flags);
-                if (lp->lksb.sb_status == -EDEADLOCK &&
-                    lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
-                        lp->req = lp->cur;
-                        acb.lc_ret |= LM_OUT_CONV_DEADLK;
-                        if (lp->cur == DLM_LOCK_IV)
-                                lp->lksb.sb_lkid = 0;
-                        goto out;
-                } else
-                        return;
-        }
-        /*
-         * This is an AST for an EX->EX conversion for sync_lvb from GFS.
-         */
-        if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
-                wake_up_ast(lp);
-                return;
-        }
-        /*
-         * A lock has been demoted to NL because it initially completed during
-         * BLOCK_LOCKS.  Now it must be requested in the originally requested
-         * mode.
-         */
-        if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
-                gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
-                            lp->lockname.ln_type,
-                            (unsigned long long)lp->lockname.ln_number);
-                gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
-                            lp->lockname.ln_type,
-                            (unsigned long long)lp->lockname.ln_number);
-                lp->cur = DLM_LOCK_NL;
-                lp->req = lp->prev_req;
-                lp->prev_req = DLM_LOCK_IV;
-                lp->lkf &= ~DLM_LKF_CONVDEADLK;
-                set_bit(LFL_NOCACHE, &lp->flags);
-                if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-                    !test_bit(LFL_NOBLOCK, &lp->flags))
-                        gdlm_queue_delayed(lp);
-                else
-                        queue_submit(lp);
-                return;
-        }
-        /*
-         * A request is granted during dlm recovery.  It may be granted
-         * because the locks of a failed node were cleared.  In that case,
-         * there may be inconsistent data beneath this lock and we must wait
-         * for recovery to complete to use it.  When gfs recovery is done this
-         * granted lock will be converted to NL and then reacquired in this
-         * granted state.
-         */
-        if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-            !test_bit(LFL_NOBLOCK, &lp->flags) &&
-            lp->req != DLM_LOCK_NL) {
-                lp->cur = lp->req;
-                lp->prev_req = lp->req;
-                lp->req = DLM_LOCK_NL;
-                lp->lkf |= DLM_LKF_CONVERT;
-                lp->lkf &= ~DLM_LKF_CONVDEADLK;
-                log_debug("rereq %x,%llx id %x %d,%d",
-                          lp->lockname.ln_type,
-                          (unsigned long long)lp->lockname.ln_number,
-                          lp->lksb.sb_lkid, lp->cur, lp->req);
-                set_bit(LFL_REREQUEST, &lp->flags);
-                queue_submit(lp);
-                return;
-        }
-        /*
-         * DLM demoted the lock to NL before it was granted so GFS must be
-         * told it cannot cache data for this lock.
-         */
-        if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
-                set_bit(LFL_NOCACHE, &lp->flags);
-out:
-        /*
-         * This is an internal lock_dlm lock
-         */
-        if (test_bit(LFL_INLOCK, &lp->flags)) {
-                clear_bit(LFL_NOBLOCK, &lp->flags);
-                lp->cur = lp->req;
-                wake_up_ast(lp);
-                return;
-        }
-        /*
-         * Normal completion of a lock request.  Tell GFS it now has the lock.
-         */
-        clear_bit(LFL_NOBLOCK, &lp->flags);
-        lp->cur = lp->req;
-        acb.lc_name = lp->lockname;
-        acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-        if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
-            (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
-                acb.lc_ret |= LM_OUT_CACHEABLE;
-        ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-static inline int no_work(struct gdlm_ls *ls, int blocking)
 {
        int ret;
        spin_lock(&ls->async_lock);
-        ret = list_empty(&ls->complete) && list_empty(&ls->submit);
+        ret = list_empty(&ls->submit);
-        if (ret && blocking)
-                ret = list_empty(&ls->blocking);
        spin_unlock(&ls->async_lock);
        return ret;
 }
-static inline int check_drop(struct gdlm_ls *ls)
+static int gdlm_thread(void *data)
-{
-        if (!ls->drop_locks_count)
-                return 0;
-        if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
-                ls->drop_time = jiffies;
-                if (ls->all_locks_count >= ls->drop_locks_count)
-                        return 1;
-        }
-        return 0;
-}
-static int gdlm_thread(void *data, int blist)
 {
        struct gdlm_ls *ls = (struct gdlm_ls *) data;
        struct gdlm_lock *lp = NULL;
-        uint8_t complete, blocking, submit, drop;
-        /* Only thread1 is allowed to do blocking callbacks since gfs
-           may wait for a completion callback within a blocking cb. */
        while (!kthread_should_stop()) {
                wait_event_interruptible(ls->thread_wait,
-                                !no_work(ls, blist) || kthread_should_stop());
+                                !no_work(ls) || kthread_should_stop());
-                complete = blocking = submit = drop = 0;
                spin_lock(&ls->async_lock);
-                if (blist && !list_empty(&ls->blocking)) {
+                if (!list_empty(&ls->submit)) {
-                        lp = list_entry(ls->blocking.next, struct gdlm_lock,
-                                        blist);
-                        list_del_init(&lp->blist);
-                        blocking = lp->bast_mode;
-                        lp->bast_mode = 0;
-                } else if (!list_empty(&ls->complete)) {
-                        lp = list_entry(ls->complete.next, struct gdlm_lock,
-                                        clist);
-                        list_del_init(&lp->clist);
-                        complete = 1;
-                } else if (!list_empty(&ls->submit)) {
                        lp = list_entry(ls->submit.next, struct gdlm_lock,
                                        delay_list);
                        list_del_init(&lp->delay_list);
-                        submit = 1;
+                        spin_unlock(&ls->async_lock);
+                        gdlm_do_lock(lp);
+                        spin_lock(&ls->async_lock);
                }
-                drop = check_drop(ls);
                spin_unlock(&ls->async_lock);
-                if (complete)
-                        process_complete(lp);
-                else if (blocking)
-                        process_blocking(lp, blocking);
-                else if (submit)
-                        gdlm_do_lock(lp);
-                if (drop)
-                        ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
-                schedule();
        }
        return 0;
 }
-static int gdlm_thread1(void *data)
-{
-        return gdlm_thread(data, 1);
-}
-static int gdlm_thread2(void *data)
-{
-        return gdlm_thread(data, 0);
-}
 int gdlm_init_threads(struct gdlm_ls *ls)
 {
        struct task_struct *p;
        int error;
-        p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
+        p = kthread_run(gdlm_thread, ls, "lock_dlm");
-        error = IS_ERR(p);
-        if (error) {
-                log_error("can't start lock_dlm1 thread %d", error);
-                return error;
-        }
-        ls->thread1 = p;
-        p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
        error = IS_ERR(p);
        if (error) {
-                log_error("can't start lock_dlm2 thread %d", error);
+                log_error("can't start lock_dlm thread %d", error);
-                kthread_stop(ls->thread1);
                return error;
        }
-        ls->thread2 = p;
+        ls->thread = p;
        return 0;
 }
 void gdlm_release_threads(struct gdlm_ls *ls)
 {
-        kthread_stop(ls->thread1);
+        kthread_stop(ls->thread);
-        kthread_stop(ls->thread2);
 }
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a8..000000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
-lock_nolock-y := main.o
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d94..000000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/lm_interface.h>
-struct nolock_lockspace {
-        unsigned int nl_lvb_size;
-};
-static const struct lm_lockops nolock_ops;
-static int nolock_mount(char *table_name, char *host_data,
-                        lm_callback_t cb, void *cb_data,
-                        unsigned int min_lvb_size, int flags,
-                        struct lm_lockstruct *lockstruct,
-                        struct kobject *fskobj)
-{
-        char *c;
-        unsigned int jid;
-        struct nolock_lockspace *nl;
-        c = strstr(host_data, "jid=");
-        if (!c)
-                jid = 0;
-        else {
-                c += 4;
-                sscanf(c, "%u", &jid);
-        }
-        nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
-        if (!nl)
-                return -ENOMEM;
-        nl->nl_lvb_size = min_lvb_size;
-        lockstruct->ls_jid = jid;
-        lockstruct->ls_first = 1;
-        lockstruct->ls_lvb_size = min_lvb_size;
-        lockstruct->ls_lockspace = nl;
-        lockstruct->ls_ops = &nolock_ops;
-        lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-        return 0;
-}
-static void nolock_others_may_mount(void *lockspace)
-{
-}
-static void nolock_unmount(void *lockspace)
-{
-        struct nolock_lockspace *nl = lockspace;
-        kfree(nl);
-}
-static void nolock_withdraw(void *lockspace)
-{
-}
-/**
- * nolock_get_lock - get a lm_lock_t given a descripton of the lock
- * @lockspace: the lockspace the lock lives in
- * @name: the name of the lock
- * @lockp: return the lm_lock_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
-                           void **lockp)
-{
-        *lockp = lockspace;
-        return 0;
-}
-/**
- * nolock_put_lock - get rid of a lock structure
- * @lock: the lock to throw away
- *
- */
-static void nolock_put_lock(void *lock)
-{
-}
-/**
- * nolock_lock - acquire a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- * @req_state: the requested state
- * @flags: modifier flags
- *
- * Returns: A bitmap of LM_OUT_*
- */
-static unsigned int nolock_lock(void *lock, unsigned int cur_state,
-                                unsigned int req_state, unsigned int flags)
-{
-        return req_state | LM_OUT_CACHEABLE;
-}
-/**
- * nolock_unlock - unlock a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- *
- * Returns: 0
- */
-static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
-{
-        return 0;
-}
-static void nolock_cancel(void *lock)
-{
-}
-/**
- * nolock_hold_lvb - hold on to a lock value block
- * @lock: the lock the LVB is associated with
- * @lvbp: return the lm_lvb_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-static int nolock_hold_lvb(void *lock, char **lvbp)
-{
-        struct nolock_lockspace *nl = lock;
-        int error = 0;
-        *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
-        if (!*lvbp)
-                error = -ENOMEM;
-        return error;
-}
-/**
- * nolock_unhold_lvb - release a LVB
- * @lock: the lock the LVB is associated with
- * @lvb: the lock value block
- *
- */
-static void nolock_unhold_lvb(void *lock, char *lvb)
-{
-        kfree(lvb);
-}
-static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
-                            struct file *file, struct file_lock *fl)
-{
-        posix_test_lock(file, fl);
-        return 0;
-}
-static int nolock_plock(void *lockspace, struct lm_lockname *name,
-                        struct file *file, int cmd, struct file_lock *fl)
-{
-        int error;
-        error = posix_lock_file_wait(file, fl);
-        return error;
-}
-static int nolock_punlock(void *lockspace, struct lm_lockname *name,
-                          struct file *file, struct file_lock *fl)
-{
-        int error;
-        error = posix_lock_file_wait(file, fl);
-        return error;
-}
-static void nolock_recovery_done(void *lockspace, unsigned int jid,
-                                 unsigned int message)
-{
-}
-static const struct lm_lockops nolock_ops = {
-        .lm_proto_name = "lock_nolock",
-        .lm_mount = nolock_mount,
-        .lm_others_may_mount = nolock_others_may_mount,
-        .lm_unmount = nolock_unmount,
-        .lm_withdraw = nolock_withdraw,
-        .lm_get_lock = nolock_get_lock,
-        .lm_put_lock = nolock_put_lock,
-        .lm_lock = nolock_lock,
-        .lm_unlock = nolock_unlock,
-        .lm_cancel = nolock_cancel,
-        .lm_hold_lvb = nolock_hold_lvb,
-        .lm_unhold_lvb = nolock_unhold_lvb,
-        .lm_plock_get = nolock_plock_get,
-        .lm_plock = nolock_plock,
-        .lm_punlock = nolock_punlock,
-        .lm_recovery_done = nolock_recovery_done,
-        .lm_owner = THIS_MODULE,
-};
-static int __init init_nolock(void)
-{
-        int error;
-        error = gfs2_register_lockproto(&nolock_ops);
-        if (error) {
-                printk(KERN_WARNING
-                       "lock_nolock: can't register protocol: %d\n", error);
-                return error;
-        }
-        printk(KERN_INFO
-               "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
-        return 0;
-}
-static void __exit exit_nolock(void)
-{
-        gfs2_unregister_lockproto(&nolock_ops);
-}
-module_init(init_nolock);
-module_exit(exit_nolock);
-MODULE_DESCRIPTION("GFS Nolock Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836d..6c6af9f5e3ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 */
 static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+__releases(&sdp->sd_log_lock)
+__acquires(&sdp->sd_log_lock)
 {
        struct gfs2_bufdata *bd, *s;
        struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 771152816508..7c64510ccfd2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
 */
 static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
 {
        spin_lock(&sdp->sd_log_lock);
 }
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
 */
 static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
 {
        spin_unlock(&sdp->sd_log_lock);
 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd50..bcc668d0fadd 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
        INIT_HLIST_NODE(&gl->gl_list);
        spin_lock_init(&gl->gl_spin);
        INIT_LIST_HEAD(&gl->gl_holders);
-        INIT_LIST_HEAD(&gl->gl_waiters1);
-        INIT_LIST_HEAD(&gl->gl_waiters3);
        gl->gl_lvb = NULL;
        atomic_set(&gl->gl_lvb_count, 0);
        INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f82..09853620c951 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 }
 /**
- * getbuf - Get a buffer with a given address space
+ * gfs2_getbuf - Get a buffer with a given address space
 * @gl: the glock
 * @blkno: the block number (filesystem scope)
 * @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 * Returns: the buffer
 */
-static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
        struct address_space *mapping = gl->gl_aspace->i_mapping;
        struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 {
        struct buffer_head *bh;
-        bh = getbuf(gl, blkno, CREATE);
+        bh = gfs2_getbuf(gl, blkno, CREATE);
        meta_prep_new(bh);
        return bh;
 }
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
                   struct buffer_head **bhp)
 {
-        *bhp = getbuf(gl, blkno, CREATE);
+        *bhp = gfs2_getbuf(gl, blkno, CREATE);
        if (!buffer_uptodate(*bhp)) {
                ll_rw_block(READ_META, 1, bhp);
                if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
        struct buffer_head *bh;
        while (blen) {
-                bh = getbuf(ip->i_gl, bstart, NO_CREATE);
+                bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
                if (bh) {
                        lock_buffer(bh);
                        gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
        if (extlen > max_ra)
                extlen = max_ra;
-        first_bh = getbuf(gl, dblock, CREATE);
+        first_bh = gfs2_getbuf(gl, dblock, CREATE);
        if (buffer_uptodate(first_bh))
                goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
        extlen--;
        while (extlen) {
-                bh = getbuf(gl, dblock, CREATE);
+                bh = gfs2_getbuf(gl, dblock, CREATE);
                if (!buffer_uptodate(bh) && !buffer_locked(bh))
                        ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe1..b1a5f3674d43 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
                   int flags, struct buffer_head **bhp);
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
 void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
                         int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb2..e64a1b04117a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
 * @file: The file to read
 * @page: The page of the file
 *
- * This deals with the locking required. We use a trylock in order to
+ * This deals with the locking required. We have to unlock and
- * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
+ * relock the page in order to get the locking in the right
- * in the event that we are unable to get the lock.
+ * order.
 */
 static int gfs2_readpage(struct file *file, struct page *page)
 {
-        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        struct address_space *mapping = page->mapping;
-        struct gfs2_holder *gh;
+        struct gfs2_inode *ip = GFS2_I(mapping->host);
+        struct gfs2_holder gh;
        int error;
-        gh = gfs2_glock_is_locked_by_me(ip->i_gl);
+        unlock_page(page);
-        if (!gh) {
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-                gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
+        error = gfs2_glock_nq_atime(&gh);
-                if (!gh)
+        if (unlikely(error))
-                        return -ENOBUFS;
+                goto out;
-                gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
+        error = AOP_TRUNCATED_PAGE;
+        lock_page(page);
+        if (page->mapping == mapping && !PageUptodate(page))
+                error = __gfs2_readpage(file, page);
+        else
                unlock_page(page);
-                error = gfs2_glock_nq_atime(gh);
+        gfs2_glock_dq(&gh);
-                if (likely(error != 0))
-                        goto out;
-                return AOP_TRUNCATED_PAGE;
-        }
-        error = __gfs2_readpage(file, page);
-        gfs2_glock_dq(gh);
 out:
-        gfs2_holder_uninit(gh);
+        gfs2_holder_uninit(&gh);
-        kfree(gh);
+        if (error && error != AOP_TRUNCATED_PAGE)
+                lock_page(page);
        return error;
 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..e9a366d4411c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
 #include <linux/uio.h>
 #include <linux/blkdev.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
                                           &i_gh);
                if (!error) {
-                        error = remote_llseek(file, offset, origin);
+                        error = generic_file_llseek_unlocked(file, offset, origin);
                        gfs2_glock_dq_uninit(&i_gh);
                }
        } else
-                error = remote_llseek(file, offset, origin);
+                error = generic_file_llseek_unlocked(file, offset, origin);
        return error;
 }
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
        [7] = GFS2_DIF_NOATIME,
        [12] = GFS2_DIF_EXHASH,
        [14] = GFS2_DIF_INHERIT_JDATA,
-        [20] = GFS2_DIF_INHERIT_DIRECTIO,
 };
 static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
        [gfs2fl_AppendOnly] = FS_APPEND_FL,
        [gfs2fl_NoAtime] = FS_NOATIME_FL,
        [gfs2fl_ExHash] = FS_INDEX_FL,
-        [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
        [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
 };
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
                return error;
        fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
-        if (!S_ISDIR(inode->i_mode)) {
+        if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
-                if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+                fsflags |= FS_JOURNAL_DATA_FL;
-                        fsflags |= FS_JOURNAL_DATA_FL;
-                if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
-                        fsflags |= FS_DIRECTIO_FL;
-        }
        if (put_user(fsflags, ptr))
                error = -EFAULT;
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
 /* Flags that can be set by user space */
 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|                    \
-                             GFS2_DIF_DIRECTIO|                 \
                             GFS2_DIF_IMMUTABLE|                \
                             GFS2_DIF_APPENDONLY|               \
                             GFS2_DIF_NOATIME|                  \
                             GFS2_DIF_SYNC|                     \
                             GFS2_DIF_SYSTEM|                   \
-                             GFS2_DIF_INHERIT_DIRECTIO|         \
                             GFS2_DIF_INHERIT_JDATA)
 /**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
        int error;
        u32 new_flags, flags;
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        error = mnt_want_write(filp->f_path.mnt);
        if (error)
                return error;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (error)
+                goto out_drop_write;
        flags = ip->i_di.di_flags;
        new_flags = (flags & ~mask) | (reqflags & mask);
        if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
            !capable(CAP_LINUX_IMMUTABLE))
                goto out;
        if (!IS_IMMUTABLE(inode)) {
-                error = permission(inode, MAY_WRITE, NULL);
+                error = gfs2_permission(inode, MAY_WRITE);
                if (error)
                        goto out;
        }
@@ -272,6 +269,8 @@ out_trans_end:
        gfs2_trans_end(sdp);
 out:
        gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+        mnt_drop_write(filp->f_path.mnt);
        return error;
 }
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
        if (!S_ISDIR(inode->i_mode)) {
                if (gfsflags & GFS2_DIF_INHERIT_JDATA)
                        gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
-                if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
-                        gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
                return do_gfs2_set_flags(filp, gfsflags, ~0);
        }
        return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
                        goto fail_gunlock;
                }
-                /* Listen to the Direct I/O flag */
-                if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
-                        file->f_flags |= O_DIRECT;
                gfs2_glock_dq_uninit(&i_gh);
        }
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
        int error = 0;
        state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-        flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE 
+        flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
-                | GL_FLOCK;
        mutex_lock(&fp->f_fl_mutex);
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
                gfs2_glock_dq_wait(fl_gh);
                gfs2_holder_reinit(state, flags, fl_gh);
        } else {
-                error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
+                error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
-                                      ip->i_no_addr, &gfs2_flock_glops,
+                                       &gfs2_flock_glops, CREATE, &gl);
-                                      CREATE, &gl);
                if (error)
                        goto out;
                gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d1..b4d1d6490633 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        mutex_init(&sdp->sd_rindex_mutex);
        INIT_LIST_HEAD(&sdp->sd_rindex_list);
        INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
-        INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
        INIT_LIST_HEAD(&sdp->sd_jindex_list);
        spin_lock_init(&sdp->sd_jindex_spin);
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 {
+        if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
+                return;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
                                        sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
                goto out;
        }
-        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
-            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
                                  GFS2_MIN_LVB_SIZE)) {
                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
 fail_locking:
        init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-        gfs2_gl_hash_clear(sdp, WAIT);
+        gfs2_gl_hash_clear(sdp);
        gfs2_lm_unmount(sdp);
        while (invalidate_inodes(sb))
                yield();
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c0029..1e252dfc5294 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (error)
                goto out;
-        error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_gunlock;
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        }
                }
        } else {
-                error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+                error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
                if (error)
                        goto out_gunlock;
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        /* Check out the dir to be renamed */
        if (dir_rename) {
-                error = permission(odentry->d_inode, MAY_WRITE, NULL);
+                error = gfs2_permission(odentry->d_inode, MAY_WRITE);
                if (error)
                        goto out_gunlock;
        }
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 * Returns: errno
 */
-static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int gfs2_permission(struct inode *inode, int mask)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
                unlock = 1;
        }
-        error = generic_permission(inode, mask, gfs2_check_acl);
+        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+                error = -EACCES;
+        else
+                error = generic_permission(inode, mask, gfs2_check_acl);
        if (unlock)
                gfs2_glock_dq_uninit(&i_gh);
        return error;
 }
+static int gfs2_iop_permission(struct inode *inode, int mask,
+                               struct nameidata *nd)
+{
+        return gfs2_permission(inode, mask);
+}
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 }
 const struct inode_operations gfs2_file_iops = {
-        .permission = gfs2_permission,
+        .permission = gfs2_iop_permission,
        .setattr = gfs2_setattr,
        .getattr = gfs2_getattr,
        .setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
        .rmdir = gfs2_rmdir,
        .mknod = gfs2_mknod,
        .rename = gfs2_rename,
-        .permission = gfs2_permission,
+        .permission = gfs2_iop_permission,
        .setattr = gfs2_setattr,
        .getattr = gfs2_getattr,
        .setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
 const struct inode_operations gfs2_symlink_iops = {
        .readlink = gfs2_readlink,
        .follow_link = gfs2_follow_link,
-        .permission = gfs2_permission,
+        .permission = gfs2_iop_permission,
        .setattr = gfs2_setattr,
        .getattr = gfs2_getattr,
        .setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb89..f66ea0f7a356 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
        gfs2_clear_rgrpd(sdp);
        gfs2_jindex_free(sdp);
        /*  Take apart glock structures and buffer lists  */
-        gfs2_gl_hash_clear(sdp, WAIT);
+        gfs2_gl_hash_clear(sdp);
        /*  Unmount the locking protocol  */
        gfs2_lm_unmount(sdp);
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
        sb->s_dirt = 0;
-        if (wait)
+        if (wait && sb->s_fs_info)
                gfs2_log_flush(sb->s_fs_info, NULL);
        return 0;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59a..3e073f5144fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
                do_sync = 0;
        else {
                value *= gfs2_jindex_size(sdp) * num;
-                do_div(value, den);
+                value = div_s64(value, den);
                value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
                if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
                        do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c5..d5e91f4f6a0b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
                                  unsigned int message)
 {
+        if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
+                return;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                sdp->sd_lockstruct.ls_ops->lm_recovery_done(
                        sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
                error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
                                           LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
-                                           GL_NOCANCEL | GL_NOCACHE, &t_gh);
+                                           GL_NOCACHE, &t_gh);
                if (error)
                        goto fail_gunlock_ji;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742b..2d90fb253505 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
        spin_lock(&sdp->sd_rindex_spin);
        sdp->sd_rindex_forward = NULL;
-        head = &sdp->sd_rindex_recent_list;
-        while (!list_empty(head)) {
-                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-                list_del(&rgd->rd_recent);
-        }
        spin_unlock(&sdp->sd_rindex_spin);
        head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 }
 /**
- * recent_rgrp_first - get first RG from "recent" list
- * @sdp: The GFS2 superblock
- * @rglast: address of the rgrp used last
- *
- * Returns: The first rgrp in the recent list
- */
-static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
-                                            u64 rglast)
-{
-        struct gfs2_rgrpd *rgd;
-        spin_lock(&sdp->sd_rindex_spin);
-        if (rglast) {
-                list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
-                        if (rgrp_contains_block(rgd, rglast))
-                                goto out;
-                }
-        }
-        rgd = NULL;
-        if (!list_empty(&sdp->sd_rindex_recent_list))
-                rgd = list_entry(sdp->sd_rindex_recent_list.next,
-                                 struct gfs2_rgrpd, rd_recent);
-out:
-        spin_unlock(&sdp->sd_rindex_spin);
-        return rgd;
-}
-/**
 * recent_rgrp_next - get next RG from "recent" list
 * @cur_rgd: current rgrp
- * @remove:
 *
 * Returns: The next rgrp in the recent list
 */
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
-                                           int remove)
 {
        struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
        struct list_head *head;
        struct gfs2_rgrpd *rgd;
        spin_lock(&sdp->sd_rindex_spin);
+        head = &sdp->sd_rindex_mru_list;
-        head = &sdp->sd_rindex_recent_list;
+        if (unlikely(cur_rgd->rd_list_mru.next == head)) {
+                spin_unlock(&sdp->sd_rindex_spin);
-        list_for_each_entry(rgd, head, rd_recent) {
+                return NULL;
-                if (rgd == cur_rgd) {
-                        if (cur_rgd->rd_recent.next != head)
-                                rgd = list_entry(cur_rgd->rd_recent.next,
-                                                 struct gfs2_rgrpd, rd_recent);
-                        else
-                                rgd = NULL;
-                        if (remove)
-                                list_del(&cur_rgd->rd_recent);
-                        goto out;
-                }
        }
+        rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
-        rgd = NULL;
-        if (!list_empty(head))
-                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-out:
        spin_unlock(&sdp->sd_rindex_spin);
        return rgd;
 }
 /**
- * recent_rgrp_add - add an RG to tail of "recent" list
- * @new_rgd: The rgrp to add
- *
- */
-static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
-{
-        struct gfs2_sbd *sdp = new_rgd->rd_sbd;
-        struct gfs2_rgrpd *rgd;
-        unsigned int count = 0;
-        unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
-        spin_lock(&sdp->sd_rindex_spin);
-        list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
-                if (rgd == new_rgd)
-                        goto out;
-                if (++count >= max)
-                        goto out;
-        }
-        list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
-out:
-        spin_unlock(&sdp->sd_rindex_spin);
-}
-/**
 * forward_rgrp_get - get an rgrp to try next from full list
 * @sdp: The GFS2 superblock
 *
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
        int loops = 0;
        int error, rg_locked;
-        /* Try recently successful rgrps */
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
-        rgd = recent_rgrp_first(sdp, ip->i_goal);
        while (rgd) {
                rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
                                return inode;
-                        rgd = recent_rgrp_next(rgd, 1);
+                        /* fall through */
-                        break;
                case GLR_TRYFAILED:
-                        rgd = recent_rgrp_next(rgd, 0);
+                        rgd = recent_rgrp_next(rgd);
                        break;
                default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 out:
        if (begin) {
-                recent_rgrp_add(rgd);
+                spin_lock(&sdp->sd_rindex_spin);
+                list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+                spin_unlock(&sdp->sd_rindex_spin);
                rgd = gfs2_rgrpd_get_next(rgd);
                if (!rgd)
                        rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f35..63a8a902d9db 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_quota_quantum = 60;
        gt->gt_atime_quantum = 3600;
        gt->gt_new_files_jdata = 0;
-        gt->gt_new_files_directio = 0;
        gt->gt_max_readahead = 1 << 18;
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
        }
        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
-                               LM_FLAG_PRIORITY | GL_NOCACHE,
+                                   GL_NOCACHE, t_gh);
-                               t_gh);
        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
                error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd0..74846559fc3f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
        return len;
 }
-static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-        if (!capable(CAP_SYS_ADMIN))
-                return -EACCES;
-        if (simple_strtol(buf, NULL, 0) != 1)
-                return -EINVAL;
-        gfs2_gl_hash_clear(sdp, NO_WAIT);
-        return len;
-}
 static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
                                size_t len)
 {
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
 GFS2_ATTR(id,                  0444, id_show,       NULL);
 GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
 GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
-GFS2_ATTR(shrink,              0200, NULL,          shrink_store);
 GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
 GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
 GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
        &gfs2_attr_id.attr,
        &gfs2_attr_fsname.attr,
        &gfs2_attr_freeze.attr,
-        &gfs2_attr_shrink.attr,
        &gfs2_attr_withdraw.attr,
        &gfs2_attr_statfs_sync.attr,
        &gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(new_files_directio, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(quota_cache_secs, 1);
 TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_quotad_secs.attr,
        &tune_attr_quota_scale.attr,
        &tune_attr_new_files_jdata.attr,
-        &tune_attr_new_files_directio.attr,
        NULL,
 };
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
        J_ASSERT(transaction->t_state == T_FINISHED);
        J_ASSERT(transaction->t_buffers == NULL);
-        J_ASSERT(transaction->t_sync_datalist == NULL);
        J_ASSERT(transaction->t_forget == NULL);
        J_ASSERT(transaction->t_iobuf_list == NULL);
        J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
+ * When an ext4 file is truncated, it is possible that some pages are not
- * not sucessfully freed, because they are attached to a committing transaction.
+ * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
 }
 /*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-        if (!jbd_trylock_bh_state(bh)) {
-                spin_unlock(&journal->j_list_lock);
-                schedule();
-                return 0;
-        }
-        return 1;
-}
-/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;
+        struct timespec now = current_kernel_time();
        if (is_journal_aborted(journal))
                return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 }
 /*
- * Wait for all submitted IO to complete.
+ * Write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
 */
-static int journal_wait_on_locked_list(journal_t *journal,
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
-                                       transaction_t *commit_transaction)
 {
-        int ret = 0;
+        int ret;
-        struct journal_head *jh;
+        struct writeback_control wbc = {
+                .sync_mode =  WB_SYNC_ALL,
-        while (commit_transaction->t_locked_list) {
+                .nr_to_write = mapping->nrpages * 2,
-                struct buffer_head *bh;
+                .range_start = 0,
+                .range_end = i_size_read(mapping->host),
-                jh = commit_transaction->t_locked_list->b_tprev;
+                .for_writepages = 1,
-                bh = jh2bh(jh);
+        };
-                get_bh(bh);
-                if (buffer_locked(bh)) {
+        ret = generic_writepages(mapping, &wbc);
-                        spin_unlock(&journal->j_list_lock);
-                        wait_on_buffer(bh);
-                        if (unlikely(!buffer_uptodate(bh)))
-                                ret = -EIO;
-                        spin_lock(&journal->j_list_lock);
-                }
-                if (!inverted_lock(journal, bh)) {
-                        put_bh(bh);
-                        spin_lock(&journal->j_list_lock);
-                        continue;
-                }
-                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        put_bh(bh);
-                } else {
-                        jbd_unlock_bh_state(bh);
-                }
-                put_bh(bh);
-                cond_resched_lock(&journal->j_list_lock);
-        }
        return ret;
-  }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+                transaction_t *commit_transaction)
 {
-        int i;
+        struct jbd2_inode *jinode;
+        int err, ret = 0;
+        struct address_space *mapping;
-        for (i = 0; i < bufs; i++) {
+        spin_lock(&journal->j_list_lock);
-                wbuf[i]->b_end_io = end_buffer_write_sync;
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-                /* We use-up our safety reference in submit_bh() */
+                mapping = jinode->i_vfs_inode->i_mapping;
-                submit_bh(WRITE, wbuf[i]);
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                /*
+                 * Submit the inode data buffers. We use writepage
+                 * instead of writepages because writepages can do
+                 * block allocation with delalloc. We need to write
+                 * only allocated blocks here.
+                 */
+                err = journal_submit_inode_data_buffers(mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                J_ASSERT(jinode->i_transaction == commit_transaction);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
+        spin_unlock(&journal->j_list_lock);
+        return ret;
 }
 /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
 */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_finish_inode_data_buffers(journal_t *journal,
-                                transaction_t *commit_transaction)
+                transaction_t *commit_transaction)
 {
-        struct journal_head *jh;
+        struct jbd2_inode *jinode, *next_i;
-        struct buffer_head *bh;
+        int err, ret = 0;
-        int locked;
-        int bufs = 0;
-        struct buffer_head **wbuf = journal->j_wbuf;
-        /*
+        /* For locking, see the comment in journal_submit_data_buffers() */
-         * Whenever we unlock the journal and sleep, things can get added
-         * onto ->t_sync_datalist, so we have to keep looping back to
-         * write_out_data until we *know* that the list is empty.
-         *
-         * Cleanup any flushed data buffers from the data list.  Even in
-         * abort mode, we want to flush this out as soon as possible.
-         */
-write_out_data:
-        cond_resched();
        spin_lock(&journal->j_list_lock);
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+        }
-        while (commit_transaction->t_sync_datalist) {
+        /* Now refile inode to proper lists */
-                jh = commit_transaction->t_sync_datalist;
+        list_for_each_entry_safe(jinode, next_i,
-                bh = jh2bh(jh);
+                                 &commit_transaction->t_inode_list, i_list) {
-                locked = 0;
+                list_del(&jinode->i_list);
+                if (jinode->i_next_transaction) {
-                /* Get reference just to make sure buffer does not disappear
+                        jinode->i_transaction = jinode->i_next_transaction;
-                 * when we are forced to drop various locks */
+                        jinode->i_next_transaction = NULL;
-                get_bh(bh);
+                        list_add(&jinode->i_list,
-                /* If the buffer is dirty, we need to submit IO and hence
+                                &jinode->i_transaction->t_inode_list);
-                 * we need the buffer lock. We try to lock the buffer without
-                 * blocking. If we fail, we need to drop j_list_lock and do
-                 * blocking lock_buffer().
-                 */
-                if (buffer_dirty(bh)) {
-                        if (test_set_buffer_locked(bh)) {
-                                BUFFER_TRACE(bh, "needs blocking lock");
-                                spin_unlock(&journal->j_list_lock);
-                                /* Write out all data to prevent deadlocks */
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                lock_buffer(bh);
-                                spin_lock(&journal->j_list_lock);
-                        }
-                        locked = 1;
-                }
-                /* We have to get bh_state lock. Again out of order, sigh. */
-                if (!inverted_lock(journal, bh)) {
-                        jbd_lock_bh_state(bh);
-                        spin_lock(&journal->j_list_lock);
-                }
-                /* Someone already cleaned up the buffer? */
-                if (!buffer_jbd(bh)
-                        || jh->b_transaction != commit_transaction
-                        || jh->b_jlist != BJ_SyncData) {
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "already cleaned up");
-                        put_bh(bh);
-                        continue;
-                }
-                if (locked && test_clear_buffer_dirty(bh)) {
-                        BUFFER_TRACE(bh, "needs writeout, adding to array");
-                        wbuf[bufs++] = bh;
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        if (bufs == journal->j_wbufsize) {
-                                spin_unlock(&journal->j_list_lock);
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                goto write_out_data;
-                        }
-                } else if (!locked && buffer_locked(bh)) {
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        put_bh(bh);
                } else {
-                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                        jinode->i_transaction = NULL;
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        /* Once for our safety reference, once for
-                         * jbd2_journal_remove_journal_head() */
-                        put_bh(bh);
-                        put_bh(bh);
-                }
-                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-                        spin_unlock(&journal->j_list_lock);
-                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
-        journal_do_submit_data(wbuf, bufs);
+        return ret;
 }
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-        err = 0;
+        err = journal_submit_data_buffers(journal, commit_transaction);
-        journal_submit_data_buffers(journal, commit_transaction);
-        /*
-         * Wait for all previously submitted IO to complete if commit
-         * record is to be written synchronously.
-         */
-        spin_lock(&journal->j_list_lock);
-        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-        spin_unlock(&journal->j_list_lock);
        if (err)
                jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        jbd_debug(3, "JBD: commit phase 2\n");
        /*
-         * If we found any dirty or locked buffers, then we should have
-         * looped back up to the write_out_data label.  If there weren't
-         * any then journal_clean_data_list should have wiped the list
-         * clean by now, so check that it is in fact empty.
-         */
-        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-        jbd_debug (3, "JBD: commit phase 3\n");
-        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);
+        err = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-                spin_lock(&journal->j_list_lock);
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-                spin_unlock(&journal->j_list_lock);
-                if (err)
-                        __jbd2_journal_abort_hard(journal);
        }
+        /*
+         * This is the right place to wait for data buffers both for ASYNC
+         * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+         * the commit block went to disk (which happens above). If commit is
+         * SYNC, we need to wait for data buffers before we start writing
+         * commit block, which happens below in such setting.
+         */
+        err = journal_finish_inode_data_buffers(journal, commit_transaction);
+        if (err)
+                jbd2_journal_abort(journal, err);
        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
           so we incur less scheduling load.
        */
-        jbd_debug(3, "JBD: commit phase 4\n");
+        jbd_debug(3, "JBD: commit phase 3\n");
        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
        J_ASSERT (commit_transaction->t_shadow_list == NULL);
-        jbd_debug(3, "JBD: commit phase 5\n");
+        jbd_debug(3, "JBD: commit phase 4\n");
        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
-        jbd_debug(3, "JBD: commit phase 6\n");
+        jbd_debug(3, "JBD: commit phase 5\n");
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
           transaction can be removed from any checkpoint list it was on
           before. */
-        jbd_debug(3, "JBD: commit phase 7\n");
+        jbd_debug(3, "JBD: commit phase 6\n");
-        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
        /* Done with this transaction! */
-        jbd_debug(3, "JBD: commit phase 8\n");
+        jbd_debug(3, "JBD: commit phase 7\n");
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 }
 /*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+        /* Start with empty list linkage and no flags set. */
+        INIT_LIST_HEAD(&jinode->i_list);
+        jinode->i_flags = 0;
+        /* Remember which VFS inode this jbd2 inode stands for. */
+        jinode->i_vfs_inode = inode;
+        /* Not attached to any transaction, current or next. */
+        jinode->i_next_transaction = NULL;
+        jinode->i_transaction = NULL;
+}
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ *
+ * @journal: journal the inode belongs to; if NULL the call is a no-op.
+ * @jinode:  jbd2 inode to detach.
+ *
+ * If JI_COMMIT_RUNNING is set in jinode->i_flags, sleep (with j_list_lock
+ * dropped) until the bit is cleared and a wakeup is issued, then retry.
+ * Once the flag is clear, unlink jinode from its transaction's inode list
+ * under j_list_lock and clear i_transaction.  This function does not itself
+ * wait for data writeback.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+                                    struct jbd2_inode *jinode)
+{
+        if (!journal)
+                return;
+restart:
+        spin_lock(&journal->j_list_lock);
+        /* Is commit writing out inode - we have to wait */
+        if (jinode->i_flags & JI_COMMIT_RUNNING) {
+                wait_queue_head_t *wq;
+                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+                /*
+                 * Queue ourselves before dropping the lock so that a wakeup
+                 * issued right after the unlock is not missed.
+                 */
+                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+                spin_unlock(&journal->j_list_lock);
+                schedule();
+                finish_wait(wq, &wait.wait);
+                /* Re-take the lock and re-check the flag from scratch. */
+                goto restart;
+        }
+        /* Only unlink if the inode is actually filed on a transaction. */
+        if (jinode->i_transaction) {
+                list_del(&jinode->i_list);
+                jinode->i_transaction = NULL;
+        }
+        spin_unlock(&journal->j_list_lock);
+}
+/*
 * debugfs tunables
 */
 #ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 *      new transaction and we can't block without protecting against other
 *      processes trying to touch the journal while it is in transition.
 *
- * Called under j_state_lock
 */
 static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
+        INIT_LIST_HEAD(&transaction->t_inode_list);
        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
 }
 /**
- * int jbd2_journal_dirty_data() -  mark a buffer as containing dirty data which
- *                             needs to be flushed before we can commit the
- *                             current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-        journal_t *journal = handle->h_transaction->t_journal;
-        int need_brelse = 0;
-        struct journal_head *jh;
-        if (is_handle_aborted(handle))
-                return 0;
-        jh = jbd2_journal_add_journal_head(bh);
-        JBUFFER_TRACE(jh, "entry");
-        /*
-         * The buffer could *already* be dirty.  Writeout can start
-         * at any time.
-         */
-        jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-        /*
-         * What if the buffer is already part of a running transaction?
-         *
-         * There are two cases:
-         * 1) It is part of the current running transaction.  Refile it,
-         *    just in case we have allocated it as metadata, deallocated
-         *    it, then reallocated it as data.
-         * 2) It is part of the previous, still-committing transaction.
-         *    If all we want to do is to guarantee that the buffer will be
-         *    written to disk before this new transaction commits, then
-         *    being sure that the *previous* transaction has this same
-         *    property is sufficient for us!  Just leave it on its old
-         *    transaction.
-         *
-         * In case (2), the buffer must not already exist as metadata
-         * --- that would violate write ordering (a transaction is free
-         * to write its data at any point, even before the previous
-         * committing transaction has committed).  The caller must
-         * never, ever allow this to happen: there's nothing we can do
-         * about it in this layer.
-         */
-        jbd_lock_bh_state(bh);
-        spin_lock(&journal->j_list_lock);
-        /* Now that we have bh_state locked, are we really still mapped? */
-        if (!buffer_mapped(bh)) {
-                JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
-                goto no_journal;
-        }
-        if (jh->b_transaction) {
-                JBUFFER_TRACE(jh, "has transaction");
-                if (jh->b_transaction != handle->h_transaction) {
-                        JBUFFER_TRACE(jh, "belongs to older transaction");
-                        J_ASSERT_JH(jh, jh->b_transaction ==
-                                        journal->j_committing_transaction);
-                        /* @@@ IS THIS TRUE  ? */
-                        /*
-                         * Not any more.  Scenario: someone does a write()
-                         * in data=journal mode.  The buffer's transaction has
-                         * moved into commit.  Then someone does another
-                         * write() to the file.  We do the frozen data copyout
-                         * and set b_next_transaction to point to j_running_t.
-                         * And while we're in that state, someone does a
-                         * writepage() in an attempt to pageout the same area
-                         * of the file via a shared mapping.  At present that
-                         * calls jbd2_journal_dirty_data(), and we get right here.
-                         * It may be too late to journal the data.  Simply
-                         * falling through to the next test will suffice: the
-                         * data will be dirty and wil be checkpointed.  The
-                         * ordering comments in the next comment block still
-                         * apply.
-                         */
-                        //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-                        /*
-                         * If we're journalling data, and this buffer was
-                         * subject to a write(), it could be metadata, forget
-                         * or shadow against the committing transaction.  Now,
-                         * someone has dirtied the same darn page via a mapping
-                         * and it is being writepage()'d.
-                         * We *could* just steal the page from commit, with some
-                         * fancy locking there.  Instead, we just skip it -
-                         * don't tie the page's buffers to the new transaction
-                         * at all.
-                         * Implication: if we crash before the writepage() data
-                         * is written into the filesystem, recovery will replay
-                         * the write() data.
-                         */
-                        if (jh->b_jlist != BJ_None &&
-                                        jh->b_jlist != BJ_SyncData &&
-                                        jh->b_jlist != BJ_Locked) {
-                                JBUFFER_TRACE(jh, "Not stealing");
-                                goto no_journal;
-                        }
-                        /*
-                         * This buffer may be undergoing writeout in commit.  We
-                         * can't return from here and let the caller dirty it
-                         * again because that can cause the write-out loop in
-                         * commit to never terminate.
-                         */
-                        if (buffer_dirty(bh)) {
-                                get_bh(bh);
-                                spin_unlock(&journal->j_list_lock);
-                                jbd_unlock_bh_state(bh);
-                                need_brelse = 1;
-                                sync_dirty_buffer(bh);
-                                jbd_lock_bh_state(bh);
-                                spin_lock(&journal->j_list_lock);
-                                /* Since we dropped the lock... */
-                                if (!buffer_mapped(bh)) {
-                                        JBUFFER_TRACE(jh, "buffer got unmapped");
-                                        goto no_journal;
-                                }
-                                /* The buffer may become locked again at any
-                                   time if it is redirtied */
-                        }
-                        /* journal_clean_data_list() may have got there first */
-                        if (jh->b_transaction != NULL) {
-                                JBUFFER_TRACE(jh, "unfile from commit");
-                                __jbd2_journal_temp_unlink_buffer(jh);
-                                /* It still points to the committing
-                                 * transaction; move it to this one so
-                                 * that the refile assert checks are
-                                 * happy. */
-                                jh->b_transaction = handle->h_transaction;
-                        }
-                        /* The buffer will be refiled below */
-                }
-                /*
-                 * Special case --- the buffer might actually have been
-                 * allocated and then immediately deallocated in the previous,
-                 * committing transaction, so might still be left on that
-                 * transaction's metadata lists.
-                 */
-                if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
-                        JBUFFER_TRACE(jh, "not on correct data list: unfile");
-                        J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-                        __jbd2_journal_temp_unlink_buffer(jh);
-                        jh->b_transaction = handle->h_transaction;
-                        JBUFFER_TRACE(jh, "file as data");
-                        __jbd2_journal_file_buffer(jh, handle->h_transaction,
-                                                BJ_SyncData);
-                }
-        } else {
-                JBUFFER_TRACE(jh, "not on a transaction");
-                __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
-        }
-no_journal:
-        spin_unlock(&journal->j_list_lock);
-        jbd_unlock_bh_state(bh);
-        if (need_brelse) {
-                BUFFER_TRACE(bh, "brelse");
-                __brelse(bh);
-        }
-        JBUFFER_TRACE(jh, "exit");
-        jbd2_journal_put_journal_head(jh);
-        return 0;
-}
-/**
 * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
 * Remove a buffer from the appropriate transaction list.
 *
 * Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+ * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
- * is holding onto a copy of one of thee pointers, it could go bad.
+ * of these pointers, it could go bad.  Generally the caller needs to re-read
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * the pointer from the transaction_t.
 *
 * Called under j_list_lock.  The journal may not be locked.
 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
        switch (jh->b_jlist) {
        case BJ_None:
                return;
-        case BJ_SyncData:
-                list = &transaction->t_sync_datalist;
-                break;
        case BJ_Metadata:
                transaction->t_nr_buffers--;
                J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
-        case BJ_Locked:
-                list = &transaction->t_locked_list;
-                break;
        }
        __blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
                goto out;
        spin_lock(&journal->j_list_lock);
-        if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
+        if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
-                if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-                        /* A written-back ordered data buffer */
-                        JBUFFER_TRACE(jh, "release data");
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd2_journal_remove_journal_head(bh);
-                        __brelse(bh);
-                }
-        } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
                /* written-back checkpointed metadata buffer */
                if (jh->b_jlist == BJ_None) {
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
        return;
 }
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold a
+ * reference to the buffers while inspecting them on
+ * t_sync_datalist or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free the buffers.
+ *
+ * Takes journal->j_state_lock itself, so the caller must not hold it.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+        transaction_t *transaction;
+        tid_t tid;
+        spin_lock(&journal->j_state_lock);
+        transaction = journal->j_committing_transaction;
+        if (!transaction) {
+                spin_unlock(&journal->j_state_lock);
+                return;
+        }
+        tid = transaction->t_tid;
+        spin_unlock(&journal->j_state_lock);
+        jbd2_log_wait_commit(journal, tid);
+}
 /**
 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If both __GFP_WAIT and __GFP_FS are set, we wait for commit code
+ * to release the buffers.
 *
 *
 * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
 */
 int jbd2_journal_try_to_free_buffers(journal_t *journal,
-                                struct page *page, gfp_t unused_gfp_mask)
+                                struct page *page, gfp_t gfp_mask)
 {
        struct buffer_head *head;
        struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
                /*
                 * We take our own ref against the journal_head here to avoid
                 * having to add tons of locking around each instance of
-                 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+                 * jbd2_journal_remove_journal_head() and
+                 * jbd2_journal_put_journal_head().
                 */
                jh = jbd2_journal_grab_journal_head(bh);
                if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
                if (buffer_jbd(bh))
                        goto busy;
        } while ((bh = bh->b_this_page) != head);
        ret = try_to_free_buffers(page);
+        /*
+         * There are a number of places where jbd2_journal_try_to_free_buffers()
+         * could race with jbd2_journal_commit_transaction(); the latter still
+         * holds a reference to the buffers to free while processing them, so
+         * try_to_free_buffers() fails to free those buffers. Some callers of
+         * releasepage() require the page buffers to be dropped and treat a
+         * failure to free them as an error (e.g. generic_file_direct_IO()).
+         *
+         * So, if the caller of try_to_release_page() wants the synchronous
+         * behaviour (i.e. make sure buffers are dropped upon return),
+         * let's wait for the current transaction to finish flushing its
+         * dirty data buffers, then try to free those buffers again,
+         * with the journal locked.
+         */
+        if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+                jbd2_journal_wait_for_transaction_sync_data(journal);
+                ret = try_to_free_buffers(page);
+        }
 busy:
        return ret;
 }
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
        if (!buffer_jbd(bh))
                goto zap_buffer_unlocked;
+        /* OK, we have a data buffer in journaled mode */
        spin_lock(&journal->j_state_lock);
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                }
        } else if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "on committing transaction");
-                if (jh->b_jlist == BJ_Locked) {
-                        /*
-                         * The buffer is on the committing transaction's locked
-                         * list.  We have the buffer locked, so I/O has
-                         * completed.  So we can nail the buffer now.
-                         */
-                        may_free = __dispose_buffer(jh, transaction);
-                        goto zap_buffer;
-                }
                /*
                 * If it is committing, we simply cannot touch it.  We
                 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
                J_ASSERT_JH(jh, !jh->b_committed_data);
                J_ASSERT_JH(jh, !jh->b_frozen_data);
                return;
-        case BJ_SyncData:
-                list = &transaction->t_sync_datalist;
-                break;
        case BJ_Metadata:
                transaction->t_nr_buffers++;
                list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
-        case BJ_Locked:
-                list =  &transaction->t_locked_list;
-                break;
        }
        __blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
        spin_unlock(&journal->j_list_lock);
        __brelse(bh);
 }
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+        transaction_t *transaction = handle->h_transaction;
+        journal_t *journal = transaction->t_journal;
+        if (is_handle_aborted(handle))
+                return -EIO;
+        jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+                        transaction->t_tid);
+        /*
+         * First check whether inode isn't already on the transaction's
+         * lists without taking the lock. Note that this check is safe
+         * without the lock as we cannot race with somebody removing inode
+         * from the transaction. The reason is that we remove inode from the
+         * transaction only in journal_release_jbd_inode() and when we commit
+         * the transaction. We are guarded from the first case by holding
+         * a reference to the inode. We are safe against the second case
+         * because if jinode->i_transaction == transaction, commit code
+         * cannot touch the transaction because we hold reference to it,
+         * and if jinode->i_next_transaction == transaction, commit code
+         * will only file the inode where we want it.
+         */
+        if (jinode->i_transaction == transaction ||
+            jinode->i_next_transaction == transaction)
+                return 0;
+        spin_lock(&journal->j_list_lock);
+        if (jinode->i_transaction == transaction ||
+            jinode->i_next_transaction == transaction)
+                goto done;
+        /* On some different transaction's list - should be
+         * the committing one */
+        if (jinode->i_transaction) {
+                J_ASSERT(jinode->i_next_transaction == NULL);
+                J_ASSERT(jinode->i_transaction ==
+                                        journal->j_committing_transaction);
+                jinode->i_next_transaction = transaction;
+                goto done;
+        }
+        /* Not on any transaction list... */
+        J_ASSERT(!jinode->i_next_transaction);
+        jinode->i_transaction = transaction;
+        list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+        spin_unlock(&journal->j_list_lock);
+        return 0;
+}
+/*
+ * This function must be called when an inode is journaled in ordered mode,
+ * before truncation happens. It starts writeout of the truncated part in
+ * case it is in the committing transaction, so that we uphold the ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+                                        loff_t new_size)
+{
+        journal_t *journal;
+        transaction_t *commit_trans;
+        int ret = 0;
+        if (!inode->i_transaction && !inode->i_next_transaction)
+                goto out;
+        journal = inode->i_transaction->t_journal;
+        spin_lock(&journal->j_state_lock);
+        commit_trans = journal->j_committing_transaction;
+        spin_unlock(&journal->j_state_lock);
+        if (inode->i_transaction == commit_trans) {
+                ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+                        new_size, LLONG_MAX);
+                if (ret)
+                        jbd2_journal_abort(journal, ret);
+        }
+out:
+        return ret;
+}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86ee..6a73de84bcef 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -30,29 +31,19 @@
 static struct proc_dir_entry *base;
 #ifdef CONFIG_JFS_DEBUG
-static int loglevel_read(char *page, char **start, off_t off,
+static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
-                         int count, int *eof, void *data)
 {
-        int len;
+        seq_printf(m, "%d\n", jfsloglevel);
+        return 0;
-        len = sprintf(page, "%d\n", jfsloglevel);
+}
-        len -= off;
-        *start = page + off;
-        if (len > count)
-                len = count;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
+static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
+{
+        return single_open(file, jfs_loglevel_proc_show, NULL);
 }
-static int loglevel_write(struct file *file, const char __user *buffer,
+static ssize_t jfs_loglevel_proc_write(struct file *file,
-                        unsigned long count, void *data)
+                const char __user *buffer, size_t count, loff_t *ppos)
 {
        char c;
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
        jfsloglevel = c - '0';
        return count;
 }
+static const struct file_operations jfs_loglevel_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_loglevel_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+        .write          = jfs_loglevel_proc_write,
+};
 #endif
 static struct {
        const char      *name;
-        read_proc_t     *read_fn;
+        const struct file_operations *proc_fops;
-        write_proc_t    *write_fn;
 } Entries[] = {
 #ifdef CONFIG_JFS_STATISTICS
-        { "lmstats",    jfs_lmstats_read, },
+        { "lmstats",    &jfs_lmstats_proc_fops, },
-        { "txstats",    jfs_txstats_read, },
+        { "txstats",    &jfs_txstats_proc_fops, },
-        { "xtstat",     jfs_xtstat_read, },
+        { "xtstat",     &jfs_xtstat_proc_fops, },
-        { "mpstat",     jfs_mpstat_read, },
+        { "mpstat",     &jfs_mpstat_proc_fops, },
 #endif
 #ifdef CONFIG_JFS_DEBUG
-        { "TxAnchor",   jfs_txanchor_read, },
+        { "TxAnchor",   &jfs_txanchor_proc_fops, },
-        { "loglevel",   loglevel_read, loglevel_write }
+        { "loglevel",   &jfs_loglevel_proc_fops }
 #endif
 };
 #define NPROCENT        ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
                return;
        base->owner = THIS_MODULE;
-        for (i = 0; i < NPROCENT; i++) {
+        for (i = 0; i < NPROCENT; i++)
-                struct proc_dir_entry *p;
+                proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
-                if ((p = create_proc_entry(Entries[i].name, 0, base))) {
-                        p->read_proc = Entries[i].read_fn;
-                        p->write_proc = Entries[i].write_fn;
-                }
-        }
 }
 void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc0..eafd1300a00b 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
 extern int jfsloglevel;
-extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txanchor_proc_fops;
 /* information message: e.g., configuration, major event */
 #define jfs_info(fmt, arg...) do {                      \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
 *      ----------
 */
 #ifdef  CONFIG_JFS_STATISTICS
-extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_lmstats_proc_fops;
-extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txstats_proc_fops;
-extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_mpstat_proc_fops;
-extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_xtstat_proc_fops;
 #define INCREMENT(x)            ((x)++)
 #define DECREMENT(x)            ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafeb..2545bb317235 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
 #define JFS_REMOVE 3
 #define JFS_RENAME 4
-#define DIRENTSIZ(namlen) \
-    ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
 /*
 * Maximum file offset for directories.
 */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916beaf..d6363d8309d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
                                        jfs_error(ip->i_sb,
                                                  "diAlloc: can't find free bit "
                                                  "in wmap");
-                                        return EIO;
+                                        return -EIO;
                                }
                                /* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95a..cd2ec2988b59 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
 #include <linux/freezer.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
 }
 #ifdef CONFIG_JFS_STATISTICS
-int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
-                      int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS Logmgr stats\n"
                       "================\n"
                       "commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
                       lmStat.pagedone,
                       lmStat.full_page,
                       lmStat.partial_page);
+        return 0;
+}
-        begin = offset;
+static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_lmstats_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_lmstats_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_lmstats_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fcd..854ff0ec574f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
 }
 #ifdef CONFIG_JFS_STATISTICS
-int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
-                    int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS Metapage statistics\n"
                       "=======================\n"
                       "page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
                       mpStat.pagealloc,
                       mpStat.pagefree,
                       mpStat.lockwait);
+        return 0;
+}
-        begin = offset;
+static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_mpstat_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_mpstat_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_mpstat_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b2..f26e4d03ada5 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/kthread.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
 }
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
-                      int *eof, void *data)
 {
-        int len = 0;
-        off_t begin;
        char *freewait;
        char *freelockwait;
        char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
        lowlockwait =
            waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
-        len += sprintf(buffer,
+        seq_printf(m,
                       "JFS TxAnchor\n"
                       "============\n"
                       "freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
                       TxAnchor.tlocksInUse,
                       jfs_tlocks_low,
                       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+        return 0;
+}
-        begin = offset;
+static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_txanchor_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_txanchor_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_txanchor_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_txstats_proc_show(struct seq_file *m, void *v)
-                     int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS TxStats\n"
                       "===========\n"
                       "calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
                       TxStat.txBeginAnon_lockslow,
                       TxStat.txLockAlloc,
                       TxStat.txLockAlloc_freelock);
+        return 0;
+}
-        begin = offset;
+static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_txstats_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_txstats_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_txstats_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbcc..ae3acafb447b 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
 */
 #include <linux/fs.h>
+#include <linux/module.h>
 #include <linux/quotaops.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 }
 #ifdef CONFIG_JFS_STATISTICS
-int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
-                    int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS Xtree statistics\n"
                       "====================\n"
                       "searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
                       xtStat.search,
                       xtStat.fastSearch,
                       xtStat.split);
+        return 0;
+}
-        begin = offset;
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_xtstat_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_xtstat_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_xtstat_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa2..2aba82386810 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
                free_UCSname(&key);
                if (rc == -ENOENT) {
                        d_add(dentry, NULL);
-                        return ERR_PTR(0);
+                        return NULL;
                } else if (rc) {
                        jfs_err("jfs_lookup: dtSearch returned %d", rc);
                        return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea65451732..0288e6d7936a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        inode = jfs_iget(sb, ROOT_I);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
-                goto out_no_root;
+                goto out_no_rw;
        }
        sb->s_root = d_alloc_root(inode);
        if (!sb->s_root)
@@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 out_no_root:
-        jfs_err("jfs_read_super: get root inode failed");
+        jfs_err("jfs_read_super: get root dentry failed");
-        if (inode)
+        iput(inode);
-                iput(inode);
 out_no_rw:
        rc = jfs_umount(sb);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5df517b81f3f..1f6dc518505c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call)
 static void nlmclnt_rpc_release(void *data)
 {
+        lock_kernel();
        nlm_release_call(data);
+        unlock_kernel();
 }
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
                         * Report the conflicting lock back to the application.
                         */
                        fl->fl_start = req->a_res.lock.fl.fl_start;
-                        fl->fl_end = req->a_res.lock.fl.fl_start;
+                        fl->fl_end = req->a_res.lock.fl.fl_end;
                        fl->fl_type = req->a_res.lock.fl.fl_type;
                        fl->fl_pid = 0;
                        break;
@@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 die:
        return;
 retry_rebind:
+        lock_kernel();
        nlm_rebind_host(req->a_host);
+        unlock_kernel();
 retry_unlock:
        rpc_restart_call(task);
 }
@@ -788,7 +792,9 @@ retry_cancel:
        /* Don't ever retry more than 3 times */
        if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
                goto die;
+        lock_kernel();
        nlm_rebind_host(req->a_host);
+        unlock_kernel();
        rpc_restart_call(task);
        rpc_delay(task, 30 * HZ);
 }
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 385437e3387d..2e27176ff42f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 static void nlm4svc_callback_release(void *data)
 {
+        lock_kernel();
        nlm_release_call(data);
+        unlock_kernel();
 }
 static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 81aca859bfde..56a08ab9a4cb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
        dprintk("lockd: GRANT_MSG RPC callback\n");
+        lock_kernel();
        /* if the block is not on a list at this point then it has
         * been invalidated. Don't try to requeue it.
         *
@@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
         * for nlm_blocked?
         */
        if (list_empty(&block->b_list))
-                return;
+                goto out;
        /* Technically, we should down the file semaphore here. Since we
         * move the block towards the head of the queue only, no harm
@@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
        }
        nlmsvc_insert_block(block, timeout);
        svc_wake_up(block->b_daemon);
+out:
+        unlock_kernel();
 }
 static void nlmsvc_grant_release(void *data)
 {
        struct nlm_rqst         *call = data;
+        lock_kernel();
        nlmsvc_release_block(call->a_block);
+        unlock_kernel();
 }
 static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 88379cc6e0b1..ce6952b50a75 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 static void nlmsvc_callback_release(void *data)
 {
+        lock_kernel();
        nlm_release_call(data);
+        unlock_kernel();
 }
 static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
        bio_put(bio);
 }
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
        bio->bi_end_io = mpage_end_io_read;
        if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
        submit_bio(rw, bio);
        return NULL;
 }
+EXPORT_SYMBOL(mpage_bio_submit);
 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (16-page) BIOs.
 */
-struct mpage_data {
-        struct bio *bio;
-        sector_t last_block_in_bio;
-        get_block_t *get_block;
-        unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
-                             void *data)
+                      void *data)
 {
        struct mpage_data *mpd = data;
        struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
        mpd->bio = bio;
        return ret;
 }
+EXPORT_SYMBOL(__mpage_writepage);
 /**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
        dentry->d_op = &msdos_dentry_operations;
-        lock_kernel();
+        lock_super(sb);
        res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
        if (res == -ENOENT)
                goto add;
@@ -232,7 +232,7 @@ add:
        if (dentry)
                dentry->d_op = &msdos_dentry_operations;
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!res)
                return dentry;
        return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
        unsigned char msdos_name[MSDOS_NAME];
        int err, is_hid;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
                                msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
        d_instantiate(dentry, inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
                err = fat_flush_inodes(sb, dir, inode);
        return err;
@@ -324,11 +324,12 @@ out:
 /***** Remove a directory */
 static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 {
+        struct super_block *sb = dir->i_sb;
        struct inode *inode = dentry->d_inode;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        /*
         * Check whether the directory is not in use, then check
         * whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
        inode->i_ctime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(inode->i_sb, dir, inode);
+                err = fat_flush_inodes(sb, dir, inode);
        return err;
 }
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct timespec ts;
        int err, is_hid, cluster;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
                                msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
-        unlock_kernel();
+        unlock_super(sb);
        fat_flush_inodes(sb, dir, inode);
        return 0;
 out_free:
        fat_free_clusters(dir, cluster);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -419,10 +420,11 @@ out:
 static int msdos_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb= inode->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
        if (err)
                goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
        inode->i_ctime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(inode->i_sb, dir, inode);
+                err = fat_flush_inodes(sb, dir, inode);
        return err;
 }
@@ -618,10 +620,11 @@ error_inode:
 static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry)
 {
+        struct super_block *sb = old_dir->i_sb;
        unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
        int err, is_hid;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(old_dentry->d_name.name,
                                old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
        err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
                              new_dir, new_msdos_name, new_dentry, is_hid);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+                err = fat_flush_inodes(sb, old_dir, new_dir);
        return err;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
        const char *str;
 };
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 {
        static const struct proc_fs_info fs_info[] = {
                { MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
                if (sb->s_flags & fs_infop->flag)
                        seq_puts(m, fs_infop->str);
        }
+        return security_sb_show_options(m, sb);
 }
 static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
        seq_putc(m, ' ');
        show_type(m, mnt->mnt_sb);
        seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
-        show_sb_opts(m, mnt->mnt_sb);
+        err = show_sb_opts(m, mnt->mnt_sb);
+        if (err)
+                goto out;
        show_mnt_opts(m, mnt);
        if (mnt->mnt_sb->s_op->show_options)
                err = mnt->mnt_sb->s_op->show_options(m, mnt);
        seq_puts(m, " 0 0\n");
+out:
        return err;
 }
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
        seq_putc(m, ' ');
        mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
        seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
-        show_sb_opts(m, sb);
+        err = show_sb_opts(m, sb);
+        if (err)
+                goto out;
        if (sb->s_op->show_options)
                err = sb->s_op->show_options(m, mnt);
        seq_putc(m, '\n');
+out:
        return err;
 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
        return 0;
 }
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+        loff_t ret;
+        lock_kernel();
+        ret = generic_file_llseek_unlocked(file, offset, origin);
+        unlock_kernel();
+        return ret;
+}
 const struct file_operations ncp_file_operations =
 {
-        .llseek         = remote_llseek,
+        .llseek         = ncp_remote_llseek,
        .read           = ncp_file_read,
        .write          = ncp_file_write,
        .ioctl          = ncp_ioctl,
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c1e7c8300629..f447f4b4476c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
 struct nfs_callback_data {
        unsigned int users;
-        struct svc_serv *serv;
+        struct svc_rqst *rqst;
        struct task_struct *task;
 };
@@ -91,21 +91,17 @@ nfs_callback_svc(void *vrqstp)
                svc_process(rqstp);
        }
        unlock_kernel();
-        nfs_callback_info.task = NULL;
-        svc_exit_thread(rqstp);
        return 0;
 }
 /*
- * Bring up the server process if it is not already up.
+ * Bring up the callback thread if it is not already up.
 */
 int nfs_callback_up(void)
 {
        struct svc_serv *serv = NULL;
-        struct svc_rqst *rqstp;
        int ret = 0;
-        lock_kernel();
        mutex_lock(&nfs_callback_mutex);
        if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
                goto out;
@@ -121,22 +117,23 @@ int nfs_callback_up(void)
        nfs_callback_tcpport = ret;
        dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
-        rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
+        nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
-        if (IS_ERR(rqstp)) {
+        if (IS_ERR(nfs_callback_info.rqst)) {
-                ret = PTR_ERR(rqstp);
+                ret = PTR_ERR(nfs_callback_info.rqst);
+                nfs_callback_info.rqst = NULL;
                goto out_err;
        }
        svc_sock_update_bufs(serv);
-        nfs_callback_info.serv = serv;
-        nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp,
+        nfs_callback_info.task = kthread_run(nfs_callback_svc,
+                                             nfs_callback_info.rqst,
                                             "nfsv4-svc");
        if (IS_ERR(nfs_callback_info.task)) {
                ret = PTR_ERR(nfs_callback_info.task);
-                nfs_callback_info.serv = NULL;
+                svc_exit_thread(nfs_callback_info.rqst);
+                nfs_callback_info.rqst = NULL;
                nfs_callback_info.task = NULL;
-                svc_exit_thread(rqstp);
                goto out_err;
        }
 out:
@@ -149,7 +146,6 @@ out:
        if (serv)
                svc_destroy(serv);
        mutex_unlock(&nfs_callback_mutex);
-        unlock_kernel();
        return ret;
 out_err:
        dprintk("Couldn't create callback socket or server thread; err = %d\n",
@@ -159,17 +155,19 @@ out_err:
 }
 /*
- * Kill the server process if it is not already down.
+ * Kill the callback thread if it's no longer being used.
 */
 void nfs_callback_down(void)
 {
-        lock_kernel();
        mutex_lock(&nfs_callback_mutex);
        nfs_callback_info.users--;
-        if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL)
+        if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
                kthread_stop(nfs_callback_info.task);
+                svc_exit_thread(nfs_callback_info.rqst);
+                nfs_callback_info.rqst = NULL;
+                nfs_callback_info.task = NULL;
+        }
        mutex_unlock(&nfs_callback_mutex);
-        unlock_kernel();
 }
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f2a092ca69b5..5ee23e7058b3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 {
        to->to_initval = timeo * HZ / 10;
        to->to_retries = retrans;
-        if (!to->to_retries)
-                to->to_retries = 2;
        switch (proto) {
        case XPRT_TRANSPORT_TCP:
        case XPRT_TRANSPORT_RDMA:
+                if (to->to_retries == 0)
+                        to->to_retries = NFS_DEF_TCP_RETRANS;
                if (to->to_initval == 0)
-                        to->to_initval = 60 * HZ;
+                        to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
                        to->to_initval = NFS_MAX_TCP_TIMEOUT;
                to->to_increment = to->to_initval;
@@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                to->to_exponential = 0;
                break;
        case XPRT_TRANSPORT_UDP:
-        default:
+                if (to->to_retries == 0)
+                        to->to_retries = NFS_DEF_UDP_RETRANS;
                if (!to->to_initval)
-                        to->to_initval = 11 * HZ / 10;
+                        to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
                if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
                        to->to_initval = NFS_MAX_UDP_TIMEOUT;
                to->to_maxval = NFS_MAX_UDP_TIMEOUT;
                to->to_exponential = 1;
                break;
+        default:
+                BUG();
        }
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 982a2064fe4c..28a238dab23a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp)
 {
        int res;
-        dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
+        dfprintk(FILE, "NFS: open dir(%s/%s)\n",
-                        inode->i_sb->s_id, inode->i_ino);
+                        filp->f_path.dentry->d_parent->d_name.name,
+                        filp->f_path.dentry->d_name.name);
+        nfs_inc_stats(inode, NFSIOS_VFSOPEN);
-        lock_kernel();
        /* Call generic open code in order to cache credentials */
        res = nfs_open(inode, filp);
-        unlock_kernel();
        return res;
 }
@@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct nfs_fattr fattr;
        long            res;
-        dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n",
+        dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        (long long)filp->f_pos);
        nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
-        lock_kernel();
        /*
         * filp->f_pos points to the dirent entry number.
         * *desc->dir_cookie has the cookie for the next entry. We have
@@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        }
 out:
        nfs_unblock_sillyrename(dentry);
-        unlock_kernel();
        if (res > 0)
                res = 0;
-        dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n",
+        dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        res);
        return res;
@@ -603,7 +601,15 @@ out:
 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 {
-        mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+        struct dentry *dentry = filp->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+        dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
+                        dentry->d_parent->d_name.name,
+                        dentry->d_name.name,
+                        offset, origin);
+        mutex_lock(&inode->i_mutex);
        switch (origin) {
                case 1:
                        offset += filp->f_pos;
@@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
                nfs_file_open_context(filp)->dir_cookie = 0;
        }
 out:
-        mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+        mutex_unlock(&inode->i_mutex);
        return offset;
 }
@@ -629,10 +635,11 @@ out:
 */
 static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
 {
-        dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n",
+        dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        datasync);
+        nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
        return 0;
 }
@@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        struct nfs_fattr fattr;
        parent = dget_parent(dentry);
-        lock_kernel();
        dir = parent->d_inode;
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
        inode = dentry->d_inode;
@@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_valid:
-        unlock_kernel();
        dput(parent);
        dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
                        __func__, dentry->d_parent->d_name.name,
@@ -824,7 +829,6 @@ out_zap_parent:
                shrink_dcache_parent(dentry);
        }
        d_drop(dentry);
-        unlock_kernel();
        dput(parent);
        dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
                        __func__, dentry->d_parent->d_name.name,
@@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry)
 }
+static void nfs_drop_nlink(struct inode *inode)
+{
+        spin_lock(&inode->i_lock);
+        if (inode->i_nlink > 0)
+                drop_nlink(inode);
+        spin_unlock(&inode->i_lock);
+}
 /*
 * Called when the dentry loses inode.
 * We use it to clean up silly-renamed files.
@@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
        if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
-                lock_kernel();
                drop_nlink(inode);
                nfs_complete_unlink(dentry, inode);
-                unlock_kernel();
        }
        iput(inode);
 }
@@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
        res = ERR_PTR(-ENOMEM);
        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
-        lock_kernel();
        /*
         * If we're doing an exclusive create, optimize away the lookup
         * but don't hash the dentry.
@@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
        if (nfs_is_exclusive_create(dir, nd)) {
                d_instantiate(dentry, NULL);
                res = NULL;
-                goto out_unlock;
+                goto out;
        }
        parent = dentry->d_parent;
@@ -940,8 +948,6 @@ no_entry:
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unblock_sillyrename:
        nfs_unblock_sillyrename(parent);
-out_unlock:
-        unlock_kernel();
 out:
        return res;
 }
@@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
        }
        /* Open the file on the server */
-        lock_kernel();
        res = nfs4_atomic_open(dir, dentry, nd);
-        unlock_kernel();
        if (IS_ERR(res)) {
                error = PTR_ERR(res);
                switch (error) {
@@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
         * operations that change the directory. We therefore save the
         * change attribute *before* we do the RPC call.
         */
-        lock_kernel();
        ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
-        unlock_kernel();
 out:
        dput(parent);
        if (!ret)
@@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
        if ((nd->flags & LOOKUP_CREATE) != 0)
                open_flags = nd->intent.open.flags;
-        lock_kernel();
        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
        if (error != 0)
                goto out_err;
-        unlock_kernel();
        return 0;
 out_err:
-        unlock_kernel();
        d_drop(dentry);
        return error;
 }
@@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        attr.ia_mode = mode;
        attr.ia_valid = ATTR_MODE;
-        lock_kernel();
        status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
        if (status != 0)
                goto out_err;
-        unlock_kernel();
        return 0;
 out_err:
-        unlock_kernel();
        d_drop(dentry);
        return status;
 }
@@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        attr.ia_valid = ATTR_MODE;
        attr.ia_mode = mode | S_IFDIR;
-        lock_kernel();
        error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
        if (error != 0)
                goto out_err;
-        unlock_kernel();
        return 0;
 out_err:
        d_drop(dentry);
-        unlock_kernel();
        return error;
 }
@@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
        dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
-        lock_kernel();
        error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
        /* Ensure the VFS deletes this inode */
        if (error == 0 && dentry->d_inode != NULL)
                clear_nlink(dentry->d_inode);
        else if (error == -ENOENT)
                nfs_dentry_handle_enoent(dentry);
-        unlock_kernel();
        return error;
 }
@@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry)
                error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
                /* The VFS may want to delete this inode */
                if (error == 0)
-                        drop_nlink(inode);
+                        nfs_drop_nlink(inode);
                nfs_mark_for_revalidate(inode);
        } else
                error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
        dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
                dir->i_ino, dentry->d_name.name);
-        lock_kernel();
        spin_lock(&dcache_lock);
        spin_lock(&dentry->d_lock);
        if (atomic_read(&dentry->d_count) > 1) {
@@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
                /* Start asynchronous writeout of the inode */
                write_inode_now(dentry->d_inode, 0);
                error = nfs_sillyrename(dir, dentry);
-                unlock_kernel();
                return error;
        }
        if (!d_unhashed(dentry)) {
@@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        } else if (need_rehash)
                d_rehash(dentry);
-        unlock_kernel();
        return error;
 }
@@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
        attr.ia_mode = S_IFLNK | S_IRWXUGO;
        attr.ia_valid = ATTR_MODE;
-        lock_kernel();
        page = alloc_page(GFP_HIGHUSER);
-        if (!page) {
+        if (!page)
-                unlock_kernel();
                return -ENOMEM;
-        }
        kaddr = kmap_atomic(page, KM_USER0);
        memcpy(kaddr, symname, pathlen);
@@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
                        dentry->d_name.name, symname, error);
                d_drop(dentry);
                __free_page(page);
-                unlock_kernel();
                return error;
        }
@@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
        } else
                __free_page(page);
-        unlock_kernel();
        return 0;
 }
@@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
                old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        lock_kernel();
        d_drop(dentry);
        error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
        if (error == 0) {
                atomic_inc(&inode->i_count);
                d_add(dentry, inode);
        }
-        unlock_kernel();
        return error;
 }
@@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * To prevent any new references to the target during the rename,
         * we unhash the dentry and free the inode in advance.
         */
-        lock_kernel();
        if (!d_unhashed(new_dentry)) {
                d_drop(new_dentry);
                rehash = new_dentry;
@@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        /* dentry still busy? */
                        goto out;
        } else
-                drop_nlink(new_inode);
+                nfs_drop_nlink(new_inode);
 go_ahead:
        /*
@@ -1669,7 +1648,6 @@ out:
        /* new dentry created? */
        if (dentry)
                dput(dentry);
-        unlock_kernel();
        return error;
 }
@@ -1962,8 +1940,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
        }
 force_lookup:
-        lock_kernel();
        if (!NFS_PROTO(inode)->access)
                goto out_notsup;
@@ -1973,7 +1949,6 @@ force_lookup:
                put_rpccred(cred);
        } else
                res = PTR_ERR(cred);
-        unlock_kernel();
 out:
        dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
                inode->i_sb->s_id, inode->i_ino, mask, res);
@@ -1982,7 +1957,6 @@ out_notsup:
        res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
        if (res == 0)
                res = generic_permission(inode, mask, NULL);
-        unlock_kernel();
        goto out;
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4757a2b326a1..08f6b040d289 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
        count = iov_length(iov, nr_segs);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
-        dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n",
+        dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name,
                count, (long long) pos);
@@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
        count = iov_length(iov, nr_segs);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
-        dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n",
+        dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name,
                count, (long long) pos);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32af..78460657f5cb 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
 static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
-static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
+static int  nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = {
        .open           = nfs_file_open,
        .flush          = nfs_file_flush,
        .release        = nfs_file_release,
-        .fsync          = nfs_fsync,
+        .fsync          = nfs_file_fsync,
        .lock           = nfs_lock,
        .flock          = nfs_flock,
        .splice_read    = nfs_file_splice_read,
@@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp)
 {
        int res;
+        dprintk("NFS: open file(%s/%s)\n",
+                        filp->f_path.dentry->d_parent->d_name.name,
+                        filp->f_path.dentry->d_name.name);
        res = nfs_check_flags(filp->f_flags);
        if (res)
                return res;
        nfs_inc_stats(inode, NFSIOS_VFSOPEN);
-        lock_kernel();
+        res = nfs_open(inode, filp);
-        res = NFS_PROTO(inode)->file_open(inode, filp);
-        unlock_kernel();
        return res;
 }
 static int
 nfs_file_release(struct inode *inode, struct file *filp)
 {
+        struct dentry *dentry = filp->f_path.dentry;
+        dprintk("NFS: release(%s/%s)\n",
+                        dentry->d_parent->d_name.name,
+                        dentry->d_name.name);
        /* Ensure that dirty pages are flushed out with the right creds */
        if (filp->f_mode & FMODE_WRITE)
-                nfs_wb_all(filp->f_path.dentry->d_inode);
+                nfs_wb_all(dentry->d_inode);
        nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
-        return NFS_PROTO(inode)->file_release(inode, filp);
+        return nfs_release(inode, filp);
 }
 /**
@@ -170,6 +178,13 @@ force_reval:
 static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 {
+        loff_t loff;
+        dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
+                        filp->f_path.dentry->d_parent->d_name.name,
+                        filp->f_path.dentry->d_name.name,
+                        offset, origin);
        /* origin == SEEK_END => we must revalidate the cached file length */
        if (origin == SEEK_END) {
                struct inode *inode = filp->f_mapping->host;
@@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
                if (retval < 0)
                        return (loff_t)retval;
        }
-        return remote_llseek(filp, offset, origin);
+        lock_kernel();  /* BKL needed? */
+        loff = generic_file_llseek_unlocked(filp, offset, origin);
+        unlock_kernel();
+        return loff;
 }
 /*
- * Helper for nfs_file_flush() and nfs_fsync()
+ * Helper for nfs_file_flush() and nfs_file_fsync()
 *
 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
 * disk, but it retrieves and clears ctx->error after synching, despite
@@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
 /*
 * Flush all dirty pages, and check for write errors.
- *
 */
 static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
        struct nfs_open_context *ctx = nfs_file_open_context(file);
-        struct inode    *inode = file->f_path.dentry->d_inode;
+        struct dentry   *dentry = file->f_path.dentry;
+        struct inode    *inode = dentry->d_inode;
        int             status;
-        dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+        dprintk("NFS: flush(%s/%s)\n",
+                        dentry->d_parent->d_name.name,
+                        dentry->d_name.name);
        if ((file->f_mode & FMODE_WRITE) == 0)
                return 0;
@@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
        if (iocb->ki_filp->f_flags & O_DIRECT)
                return nfs_file_direct_read(iocb, iov, nr_segs, pos);
-        dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
+        dprintk("NFS: read(%s/%s, %lu@%lu)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
                (unsigned long) count, (unsigned long) pos);
@@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
        struct inode *inode = dentry->d_inode;
        ssize_t res;
-        dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n",
+        dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
                (unsigned long) count, (unsigned long long) *ppos);
@@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
        struct inode *inode = dentry->d_inode;
        int     status;
-        dfprintk(VFS, "nfs: mmap(%s/%s)\n",
+        dprintk("NFS: mmap(%s/%s)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
        status = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 * whether any write errors occurred for this process.
 */
 static int
-nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct inode *inode = dentry->d_inode;
-        dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+        dprintk("NFS: fsync file(%s/%s) datasync %d\n",
+                        dentry->d_parent->d_name.name, dentry->d_name.name,
+                        datasync);
        nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
        return nfs_do_fsync(ctx, inode);
@@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
        struct page *page;
        index = pos >> PAGE_CACHE_SHIFT;
+        dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+                file->f_path.dentry->d_parent->d_name.name,
+                file->f_path.dentry->d_name.name,
+                mapping->host->i_ino, len, (long long) pos);
        page = __grab_cache_page(mapping, index);
        if (!page)
                return -ENOMEM;
@@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
        int status;
-        lock_kernel();
+        dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+                file->f_path.dentry->d_parent->d_name.name,
+                file->f_path.dentry->d_name.name,
+                mapping->host->i_ino, len, (long long) pos);
+        /*
+         * Zero any uninitialised parts of the page, and then mark the page
+         * as up to date if it turns out that we're extending the file.
+         */
+        if (!PageUptodate(page)) {
+                unsigned pglen = nfs_page_length(page);
+                unsigned end = offset + len;
+                if (pglen == 0) {
+                        zero_user_segments(page, 0, offset,
+                                        end, PAGE_CACHE_SIZE);
+                        SetPageUptodate(page);
+                } else if (end >= pglen) {
+                        zero_user_segment(page, end, PAGE_CACHE_SIZE);
+                        if (offset == 0)
+                                SetPageUptodate(page);
+                } else
+                        zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+        }
        status = nfs_updatepage(file, page, offset, copied);
-        unlock_kernel();
        unlock_page(page);
        page_cache_release(page);
@@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
+        dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
        if (offset != 0)
                return;
        /* Cancel any unstarted writes on this page */
@@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
+        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
        /* If PagePrivate() is set, then the page is not freeable */
        return 0;
 }
 static int nfs_launder_page(struct page *page)
 {
-        return nfs_wb_page(page->mapping->host, page);
+        struct inode *inode = page->mapping->host;
+        /*
+         * Trace the request, then hand the page to nfs_wb_page().
+         * page_offset() is cast to unsigned long long so that the
+         * argument type agrees with the %llu conversion.
+         */
+        dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+                inode->i_ino, (unsigned long long)page_offset(page));
+        return nfs_wb_page(inode, page);
 }
 const struct address_space_operations nfs_file_aops = {
@@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = {
 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
        struct file *filp = vma->vm_file;
+        struct dentry *dentry = filp->f_path.dentry;
        unsigned pagelen;
        int ret = -EINVAL;
        struct address_space *mapping;
+        dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
+                dentry->d_parent->d_name.name, dentry->d_name.name,
+                filp->f_mapping->host->i_ino,
+                (long long)page_offset(page));
        lock_page(page);
        mapping = page->mapping;
-        if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+        if (mapping != dentry->d_inode->i_mapping)
                goto out_unlock;
        ret = 0;
@@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
        if (iocb->ki_filp->f_flags & O_DIRECT)
                return nfs_file_direct_write(iocb, iov, nr_segs, pos);
-        dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
+        dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
-                inode->i_ino, (unsigned long) count, (long long) pos);
+                (unsigned long) count, (long long) pos);
        result = -EBUSY;
        if (IS_SWAPFILE(inode))
@@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
         * This makes locking act as a cache coherency point.
         */
        nfs_sync_mapping(filp->f_mapping);
-        nfs_zap_caches(inode);
+        if (!nfs_have_delegation(inode, FMODE_READ))
+                nfs_zap_caches(inode);
 out:
        return status;
 }
@@ -592,23 +658,35 @@ out:
 */
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-        struct inode * inode = filp->f_mapping->host;
+        struct inode *inode = filp->f_mapping->host;
+        int ret = -ENOLCK;      /* returned as-is when the mandatory-lock check below rejects the request */
-        dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
+        dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
-                        inode->i_sb->s_id, inode->i_ino,
+                        filp->f_path.dentry->d_parent->d_name.name,
+                        filp->f_path.dentry->d_name.name,
                         fl->fl_type, fl->fl_flags,
                         (long long)fl->fl_start, (long long)fl->fl_end);
         nfs_inc_stats(inode, NFSIOS_VFSLOCK);
         /* No mandatory locks over NFS */
         if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
-                return -ENOLCK;
+                goto out_err;
+        if (NFS_PROTO(inode)->lock_check_bounds != NULL) {      /* the per-protocol range check is optional */
+                ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+                if (ret < 0)    /* a negative result is returned to the caller unchanged */
+                        goto out_err;
+        }
         if (IS_GETLK(cmd))
-                return do_getlk(filp, cmd, fl);
+                ret = do_getlk(filp, cmd, fl);
-        if (fl->fl_type == F_UNLCK)
+        else if (fl->fl_type == F_UNLCK)
-                return do_unlk(filp, cmd, fl);
+                ret = do_unlk(filp, cmd, fl);
-        return do_setlk(filp, cmd, fl);
+        else
+                ret = do_setlk(filp, cmd, fl);
+out_err:        /* single exit: ret is either an early error or the helper's result */
+        return ret;
 }
 /*
@@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
-        dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
+        dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
-                        filp->f_path.dentry->d_inode->i_sb->s_id,
+                        filp->f_path.dentry->d_parent->d_name.name,
-                        filp->f_path.dentry->d_inode->i_ino,
+                        filp->f_path.dentry->d_name.name,
                        fl->fl_type, fl->fl_flags);
        /*
@@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
        return do_setlk(filp, cmd, fl);
 }
+/*
+ * There is no protocol support for leases, so we have no way to implement
+ * them correctly in the face of opens by other clients.
+ *
+ * The request is only logged (parent/file name and @arg); @fl is not
+ * examined and the call always fails with -EINVAL.
+ */
 static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
 {
-        /*
+        dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
-         * There is no protocol support for leases, so we have no way
+                        file->f_path.dentry->d_parent->d_name.name,
-         * to implement them correctly in the face of opens by other
+                        file->f_path.dentry->d_name.name, arg);
-         * clients.
-         */
         return -EINVAL;
 }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596c5d8e86f4..df23f987da6b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
-static void nfs_zap_acl_cache(struct inode *);
 static struct kmem_cache * nfs_inode_cachep;
 static inline unsigned long
@@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
        }
 }
-static void nfs_zap_acl_cache(struct inode *inode)
+void nfs_zap_acl_cache(struct inode *inode)
 {
        void (*clear_acl_cache)(struct inode *);
@@ -347,7 +345,7 @@ out_no_inode:
        goto out;
 }
-#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET)
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
 int
 nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
        /* Optimization: if the end result is no change, don't RPC */
        attr->ia_valid &= NFS_VALID_ATTRS;
-        if (attr->ia_valid == 0)
+        if ((attr->ia_valid & ~ATTR_FILE) == 0)
                return 0;
-        lock_kernel();
        /* Write all dirty data */
        if (S_ISREG(inode->i_mode)) {
                filemap_write_and_wait(inode->i_mapping);
@@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
        error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
        if (error == 0)
                nfs_refresh_inode(inode, &fattr);
-        unlock_kernel();
        return error;
 }
 /**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ *
+ * If @offset is larger than the current i_size, only i_size is updated,
+ * after @offset has been checked against the caller's RLIMIT_FSIZE and
+ * the superblock's s_maxbytes.  Otherwise i_size is set to @offset and
+ * unmap_mapping_range()/truncate_inode_pages() are called for the range
+ * starting at @offset.
+ *
+ * Returns 0 on success; -EFBIG if @offset exceeds RLIMIT_FSIZE (SIGXFSZ
+ * is sent to the current task first) or s_maxbytes; -ETXTBSY if the
+ * non-extending path is taken on a swapfile.
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+        if (i_size_read(inode) < offset) {      /* new size is larger: no pages to drop */
+                unsigned long limit;
+                limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+                if (limit != RLIM_INFINITY && offset > limit)
+                        goto out_sig;
+                if (offset > inode->i_sb->s_maxbytes)
+                        goto out_big;
+                spin_lock(&inode->i_lock);
+                i_size_write(inode, offset);
+                spin_unlock(&inode->i_lock);
+        } else {
+                struct address_space *mapping = inode->i_mapping;
+                /*
+                 * truncation of in-use swapfiles is disallowed - it would
+                 * cause subsequent swapout to scribble on the now-freed
+                 * blocks.
+                 */
+                if (IS_SWAPFILE(inode))
+                        return -ETXTBSY;
+                spin_lock(&inode->i_lock);
+                i_size_write(inode, offset);
+                spin_unlock(&inode->i_lock);
+                /*
+                 * unmap_mapping_range is called twice, first simply for
+                 * efficiency so that truncate_inode_pages does fewer
+                 * single-page unmaps.  However after this first call, and
+                 * before truncate_inode_pages finishes, it is possible for
+                 * private pages to be COWed, which remain after
+                 * truncate_inode_pages finishes, hence the second
+                 * unmap_mapping_range call must be made for correctness.
+                 */
+                unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+                truncate_inode_pages(mapping, offset);
+                unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+        }
+        return 0;
+out_sig:
+        send_sig(SIGXFSZ, current, 0);  /* then falls through to return -EFBIG */
+out_big:
+        return -EFBIG;
+}
+/**
 * nfs_setattr_update_inode - Update inode metadata after a setattr call.
 * @inode: pointer to struct inode
 * @attr: pointer to struct iattr
@@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
        }
        if ((attr->ia_valid & ATTR_SIZE) != 0) {
                nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
-                inode->i_size = attr->ia_size;
+                nfs_vmtruncate(inode, attr->ia_size);
-                vmtruncate(inode, attr->ia_size);
        }
 }
@@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                inode->i_sb->s_id, (long long)NFS_FILEID(inode));
        nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-        lock_kernel();
        if (is_bad_inode(inode))
                goto out_nowait;
        if (NFS_STALE(inode))
@@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        nfs_wake_up_inode(inode);
 out_nowait:
-        unlock_kernel();
        return status;
 }
@@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        if (S_ISDIR(inode->i_mode))
                                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
                }
-                if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
+                if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
                    nfsi->npages == 0)
-                        inode->i_size = nfs_size_to_loff_t(fattr->size);
+                        i_size_write(inode, nfs_size_to_loff_t(fattr->size));
        }
 }
@@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
                        (fattr->valid & NFS_ATTR_WCC) == 0) {
                memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
                memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
-                fattr->pre_size = inode->i_size;
+                fattr->pre_size = i_size_read(inode);
                fattr->valid |= NFS_ATTR_WCC;
        }
        return nfs_post_op_update_inode(inode, fattr);
@@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                /* Do we perhaps have any outstanding writes, or has
                 * the file grown beyond our last write? */
                if (nfsi->npages == 0 || new_isize > cur_isize) {
-                        inode->i_size = new_isize;
+                        i_size_write(inode, new_isize);
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
                }
                dprintk("NFS: isize change on server for file %s/%ld\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 04ae867dddba..24241fcbb98d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
 #endif
+void nfs_zap_acl_cache(struct inode *inode);
 /* super.c */
 extern struct file_system_type nfs_xdev_fs_type;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 6350ecbde589..a36952810032 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -5,135 +5,41 @@
 *
 *  Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
 *
- *  NFS client per-mount statistics provide information about the health of
- *  the NFS client and the health of each NFS mount point.  Generally these
- *  are not for detailed problem diagnosis, but simply to indicate that there
- *  is a problem.
- *
- *  These counters are not meant to be human-readable, but are meant to be
- *  integrated into system monitoring tools such as "sar" and "iostat".  As
- *  such, the counters are sampled by the tools over time, and are never
- *  zeroed after a file system is mounted.  Moving averages can be computed
- *  by the tools by taking the difference between two instantaneous samples
- *  and dividing that by the time between the samples.
 */
 #ifndef _NFS_IOSTAT
 #define _NFS_IOSTAT
-#define NFS_IOSTAT_VERS         "1.0"
-/*
- * NFS byte counters
- *
- * 1.  SERVER - the number of payload bytes read from or written to the
- *     server by the NFS client via an NFS READ or WRITE request.
- *
- * 2.  NORMAL - the number of bytes read or written by applications via
- *     the read(2) and write(2) system call interfaces.
- *
- * 3.  DIRECT - the number of bytes read or written from files opened
- *     with the O_DIRECT flag.
- *
- * These counters give a view of the data throughput into and out of the NFS
- * client.  Comparing the number of bytes requested by an application with the
- * number of bytes the client requests from the server can provide an
- * indication of client efficiency (per-op, cache hits, etc).
- *
- * These counters can also help characterize which access methods are in
- * use.  DIRECT by itself shows whether there is any O_DIRECT traffic.
- * NORMAL + DIRECT shows how much data is going through the system call
- * interface.  A large amount of SERVER traffic without much NORMAL or
- * DIRECT traffic shows that applications are using mapped files.
- *
- * NFS page counters
- *
- * These count the number of pages read or written via nfs_readpage(),
- * nfs_readpages(), or their write equivalents.
- */
-enum nfs_stat_bytecounters {
-        NFSIOS_NORMALREADBYTES = 0,
-        NFSIOS_NORMALWRITTENBYTES,
-        NFSIOS_DIRECTREADBYTES,
-        NFSIOS_DIRECTWRITTENBYTES,
-        NFSIOS_SERVERREADBYTES,
-        NFSIOS_SERVERWRITTENBYTES,
-        NFSIOS_READPAGES,
-        NFSIOS_WRITEPAGES,
-        __NFSIOS_BYTESMAX,
-};
-/*
- * NFS event counters
- *
- * These counters provide a low-overhead way of monitoring client activity
- * without enabling NFS trace debugging.  The counters show the rate at
- * which VFS requests are made, and how often the client invalidates its
- * data and attribute caches.  This allows system administrators to monitor
- * such things as how close-to-open is working, and answer questions such
- * as "why are there so many GETATTR requests on the wire?"
- *
- * They also count anamolous events such as short reads and writes, silly
- * renames due to close-after-delete, and operations that change the size
- * of a file (such operations can often be the source of data corruption
- * if applications aren't using file locking properly).
- */
-enum nfs_stat_eventcounters {
-        NFSIOS_INODEREVALIDATE = 0,
-        NFSIOS_DENTRYREVALIDATE,
-        NFSIOS_DATAINVALIDATE,
-        NFSIOS_ATTRINVALIDATE,
-        NFSIOS_VFSOPEN,
-        NFSIOS_VFSLOOKUP,
-        NFSIOS_VFSACCESS,
-        NFSIOS_VFSUPDATEPAGE,
-        NFSIOS_VFSREADPAGE,
-        NFSIOS_VFSREADPAGES,
-        NFSIOS_VFSWRITEPAGE,
-        NFSIOS_VFSWRITEPAGES,
-        NFSIOS_VFSGETDENTS,
-        NFSIOS_VFSSETATTR,
-        NFSIOS_VFSFLUSH,
-        NFSIOS_VFSFSYNC,
-        NFSIOS_VFSLOCK,
-        NFSIOS_VFSRELEASE,
-        NFSIOS_CONGESTIONWAIT,
-        NFSIOS_SETATTRTRUNC,
-        NFSIOS_EXTENDWRITE,
-        NFSIOS_SILLYRENAME,
-        NFSIOS_SHORTREAD,
-        NFSIOS_SHORTWRITE,
-        NFSIOS_DELAY,
-        __NFSIOS_COUNTSMAX,
-};
-#ifdef __KERNEL__
 #include <linux/percpu.h>
 #include <linux/cache.h>
+#include <linux/nfs_iostat.h>
 struct nfs_iostats {
        unsigned long long      bytes[__NFSIOS_BYTESMAX];
        unsigned long           events[__NFSIOS_COUNTSMAX];
 } ____cacheline_aligned;
-static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+                                        enum nfs_stat_eventcounters stat)
 {
        struct nfs_iostats *iostats;
        int cpu;
        cpu = get_cpu();
        iostats = per_cpu_ptr(server->io_stats, cpu);
-        iostats->events[stat] ++;
+        iostats->events[stat]++;
        put_cpu_no_resched();
 }
-static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_stats(const struct inode *inode,
+                                 enum nfs_stat_eventcounters stat)
 {
        nfs_inc_server_stats(NFS_SERVER(inode), stat);
 }
-static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+                                        enum nfs_stat_bytecounters stat,
+                                        unsigned long addend)
 {
        struct nfs_iostats *iostats;
        int cpu;
@@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat
        put_cpu_no_resched();
 }
-static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_stats(const struct inode *inode,
+                                 enum nfs_stat_bytecounters stat,
+                                 unsigned long addend)
 {
        nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
@@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats)
                free_percpu(stats);
 }
-#endif
+#endif /* _NFS_IOSTAT */
-#endif
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9b7362565c0c..423842f51ac9 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -5,6 +5,8 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/nfsacl.h>
+#include "internal.h"
 #define NFSDBG_FACILITY NFSDBG_PROC
 ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
        status = nfs_revalidate_inode(server, inode);
        if (status < 0)
                return ERR_PTR(status);
+        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+                nfs_zap_acl_cache(inode);
        acl = nfs3_get_cached_acl(inode, type);
        if (acl != ERR_PTR(-EAGAIN))
                return acl;
@@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
        dprintk("NFS call setacl\n");
        msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
        status = rpc_call_sync(server->client_acl, &msg, 0);
-        spin_lock(&inode->i_lock);
+        nfs_access_zap_cache(inode);
-        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
+        nfs_zap_acl_cache(inode);
-        spin_unlock(&inode->i_lock);
        dprintk("NFS reply setacl: %d\n", status);
        /* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c3523ad03ed1..1e750e4574a9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
        int     status;
        dprintk("NFS call  setattr\n");
+        if (sattr->ia_valid & ATTR_FILE)
+                msg.rpc_cred = nfs_file_cred(sattr->ia_file);
        nfs_fattr_init(fattr);
        status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
        if (status == 0)
@@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
        return status;
 }
+struct nfs3_createdata {
+        struct rpc_message msg;         /* rpc_argp/rpc_resp are pointed at arg/res by nfs3_alloc_createdata() */
+        union {                         /* argument layouts for the procedures sharing this container */
+                struct nfs3_createargs create;
+                struct nfs3_mkdirargs mkdir;
+                struct nfs3_symlinkargs symlink;
+                struct nfs3_mknodargs mknod;
+        } arg;
+        struct nfs3_diropres res;       /* its fh/fattr/dir_attr pointers are set to the three members below */
+        struct nfs_fh fh;
+        struct nfs_fattr fattr;
+        struct nfs_fattr dir_attr;
+};
+/*
+ * Allocate a zero-filled nfs3_createdata and wire its rpc_message and
+ * result pointers to the storage embedded in the same allocation.
+ * Returns NULL if the allocation fails.
+ */
+static struct nfs3_createdata *nfs3_alloc_createdata(void)
+{
+        struct nfs3_createdata *cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+        if (cd == NULL)
+                return NULL;
+        /* The RPC message refers to the embedded argument and result. */
+        cd->msg.rpc_argp = &cd->arg;
+        cd->msg.rpc_resp = &cd->res;
+        /* The result refers to the embedded file handle and attributes. */
+        cd->res.fh = &cd->fh;
+        cd->res.fattr = &cd->fattr;
+        cd->res.dir_attr = &cd->dir_attr;
+        nfs_fattr_init(&cd->fattr);
+        nfs_fattr_init(&cd->dir_attr);
+        return cd;
+}
+/*
+ * Issue the RPC prepared in @data, update @dir from the returned
+ * directory attributes, and instantiate @dentry if the call succeeded.
+ * Returns the RPC status on failure, else the nfs_instantiate() result.
+ */
+static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+        int rpc_status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+        /* The directory is updated whether or not the call succeeded. */
+        nfs_post_op_update_inode(dir, data->res.dir_attr);
+        if (rpc_status != 0)
+                return rpc_status;
+        return nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+}
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+        kfree(data);    /* nfs3_proc_create() reaches this with data == NULL after a failed allocation; relies on kfree() accepting NULL */
+}
 /*
 * Create a regular file.
 * For now, we don't implement O_EXCL.
@@ -256,70 +305,60 @@ static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                 int flags, struct nameidata *nd)
 {
-        struct nfs_fh           fhandle;
+        struct nfs3_createdata *data;
-        struct nfs_fattr        fattr;
-        struct nfs_fattr        dir_attr;
-        struct nfs3_createargs  arg = {
-                .fh             = NFS_FH(dir),
-                .name           = dentry->d_name.name,
-                .len            = dentry->d_name.len,
-                .sattr          = sattr,
-        };
-        struct nfs3_diropres    res = {
-                .dir_attr       = &dir_attr,
-                .fh             = &fhandle,
-                .fattr          = &fattr
-        };
-        struct rpc_message msg = {
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_CREATE],
-                .rpc_argp       = &arg,
-                .rpc_resp       = &res,
-        };
        mode_t mode = sattr->ia_mode;
-        int status;
+        int status = -ENOMEM;
        dprintk("NFS call  create %s\n", dentry->d_name.name);
-        arg.createmode = NFS3_CREATE_UNCHECKED;
+        data = nfs3_alloc_createdata();
+        if (data == NULL)
+                goto out;
+        data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+        data->arg.create.fh = NFS_FH(dir);
+        data->arg.create.name = dentry->d_name.name;
+        data->arg.create.len = dentry->d_name.len;
+        data->arg.create.sattr = sattr;
+        data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
        if (flags & O_EXCL) {
-                arg.createmode  = NFS3_CREATE_EXCLUSIVE;
+                data->arg.create.createmode  = NFS3_CREATE_EXCLUSIVE;
-                arg.verifier[0] = jiffies;
+                data->arg.create.verifier[0] = jiffies;
-                arg.verifier[1] = current->pid;
+                data->arg.create.verifier[1] = current->pid;
        }
        sattr->ia_mode &= ~current->fs->umask;
-again:
+        for (;;) {
-        nfs_fattr_init(&dir_attr);
+                status = nfs3_do_create(dir, dentry, data);
-        nfs_fattr_init(&fattr);
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-        nfs_refresh_inode(dir, &dir_attr);
-        /* If the server doesn't support the exclusive creation semantics,
+                if (status != -ENOTSUPP)
-         * try again with simple 'guarded' mode. */
+                        break;
-        if (status == -ENOTSUPP) {
+                /* If the server doesn't support the exclusive creation
-                switch (arg.createmode) {
+                 * semantics, try again with simple 'guarded' mode. */
+                switch (data->arg.create.createmode) {
                        case NFS3_CREATE_EXCLUSIVE:
-                                arg.createmode = NFS3_CREATE_GUARDED;
+                                data->arg.create.createmode = NFS3_CREATE_GUARDED;
                                break;
                        case NFS3_CREATE_GUARDED:
-                                arg.createmode = NFS3_CREATE_UNCHECKED;
+                                data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
                                break;
                        case NFS3_CREATE_UNCHECKED:
                                goto out;
                }
-                goto again;
+                nfs_fattr_init(data->res.dir_attr);
+                nfs_fattr_init(data->res.fattr);
        }
-        if (status == 0)
-                status = nfs_instantiate(dentry, &fhandle, &fattr);
        if (status != 0)
                goto out;
        /* When we created the file with exclusive semantics, make
         * sure we set the attributes afterwards. */
-        if (arg.createmode == NFS3_CREATE_EXCLUSIVE) {
+        if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
                dprintk("NFS call  setattr (post-create)\n");
                if (!(sattr->ia_valid & ATTR_ATIME_SET))
@@ -330,14 +369,15 @@ again:
                /* Note: we could use a guarded setattr here, but I'm
                 * not sure this buys us anything (and I'd have
                 * to revamp the NFSv3 XDR code) */
-                status = nfs3_proc_setattr(dentry, &fattr, sattr);
+                status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
-                nfs_post_op_update_inode(dentry->d_inode, &fattr);
+                nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
                dprintk("NFS reply setattr (post-create): %d\n", status);
+                if (status != 0)
+                        goto out;
        }
-        if (status != 0)
-                goto out;
        status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
 out:
+        nfs3_free_createdata(data);
        dprintk("NFS reply create: %d\n", status);
        return status;
 }
@@ -452,40 +492,28 @@ static int
 nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
                  unsigned int len, struct iattr *sattr)
 {
-        struct nfs_fh fhandle;
+        struct nfs3_createdata *data;
-        struct nfs_fattr fattr, dir_attr;
+        int status = -ENOMEM;
-        struct nfs3_symlinkargs arg = {
-                .fromfh         = NFS_FH(dir),
-                .fromname       = dentry->d_name.name,
-                .fromlen        = dentry->d_name.len,
-                .pages          = &page,
-                .pathlen        = len,
-                .sattr          = sattr
-        };
-        struct nfs3_diropres    res = {
-                .dir_attr       = &dir_attr,
-                .fh             = &fhandle,
-                .fattr          = &fattr
-        };
-        struct rpc_message msg = {
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_SYMLINK],
-                .rpc_argp       = &arg,
-                .rpc_resp       = &res,
-        };
-        int                     status;
        if (len > NFS3_MAXPATHLEN)
                return -ENAMETOOLONG;
        dprintk("NFS call  symlink %s\n", dentry->d_name.name);
-        nfs_fattr_init(&dir_attr);
+        data = nfs3_alloc_createdata();
-        nfs_fattr_init(&fattr);
+        if (data == NULL)
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-        nfs_post_op_update_inode(dir, &dir_attr);
-        if (status != 0)
                goto out;
-        status = nfs_instantiate(dentry, &fhandle, &fattr);
+        data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+        data->arg.symlink.fromfh = NFS_FH(dir);
+        data->arg.symlink.fromname = dentry->d_name.name;
+        data->arg.symlink.fromlen = dentry->d_name.len;
+        data->arg.symlink.pages = &page;
+        data->arg.symlink.pathlen = len;
+        data->arg.symlink.sattr = sattr;
+        status = nfs3_do_create(dir, dentry, data);
+        nfs3_free_createdata(data);
 out:
        dprintk("NFS reply symlink: %d\n", status);
        return status;
@@ -494,42 +522,31 @@ out:
 static int
 nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
-        struct nfs_fh fhandle;
+        struct nfs3_createdata *data;
-        struct nfs_fattr fattr, dir_attr;
-        struct nfs3_mkdirargs   arg = {
-                .fh             = NFS_FH(dir),
-                .name           = dentry->d_name.name,
-                .len            = dentry->d_name.len,
-                .sattr          = sattr
-        };
-        struct nfs3_diropres    res = {
-                .dir_attr       = &dir_attr,
-                .fh             = &fhandle,
-                .fattr          = &fattr
-        };
-        struct rpc_message msg = {
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_MKDIR],
-                .rpc_argp       = &arg,
-                .rpc_resp       = &res,
-        };
        int mode = sattr->ia_mode;
-        int status;
+        int status = -ENOMEM;
        dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
        sattr->ia_mode &= ~current->fs->umask;
-        nfs_fattr_init(&dir_attr);
+        data = nfs3_alloc_createdata();
-        nfs_fattr_init(&fattr);
+        if (data == NULL)
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-        nfs_post_op_update_inode(dir, &dir_attr);
-        if (status != 0)
                goto out;
-        status = nfs_instantiate(dentry, &fhandle, &fattr);
+        data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
+        data->arg.mkdir.fh = NFS_FH(dir);
+        data->arg.mkdir.name = dentry->d_name.name;
+        data->arg.mkdir.len = dentry->d_name.len;
+        data->arg.mkdir.sattr = sattr;
+        status = nfs3_do_create(dir, dentry, data);
        if (status != 0)
                goto out;
        status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
 out:
+        nfs3_free_createdata(data);
        dprintk("NFS reply mkdir: %d\n", status);
        return status;
 }
@@ -615,52 +632,50 @@ static int
 nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                dev_t rdev)
 {
-        struct nfs_fh fh;
+        struct nfs3_createdata *data;
-        struct nfs_fattr fattr, dir_attr;
-        struct nfs3_mknodargs   arg = {
-                .fh             = NFS_FH(dir),
-                .name           = dentry->d_name.name,
-                .len            = dentry->d_name.len,
-                .sattr          = sattr,
-                .rdev           = rdev
-        };
-        struct nfs3_diropres    res = {
-                .dir_attr       = &dir_attr,
-                .fh             = &fh,
-                .fattr          = &fattr
-        };
-        struct rpc_message msg = {
-                .rpc_proc       = &nfs3_procedures[NFS3PROC_MKNOD],
-                .rpc_argp       = &arg,
-                .rpc_resp       = &res,
-        };
        mode_t mode = sattr->ia_mode;
-        int status;
+        int status = -ENOMEM;
-        switch (sattr->ia_mode & S_IFMT) {
-        case S_IFBLK:   arg.type = NF3BLK;  break;
-        case S_IFCHR:   arg.type = NF3CHR;  break;
-        case S_IFIFO:   arg.type = NF3FIFO; break;
-        case S_IFSOCK:  arg.type = NF3SOCK; break;
-        default:        return -EINVAL;
-        }
        dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
                        MAJOR(rdev), MINOR(rdev));
        sattr->ia_mode &= ~current->fs->umask;
-        nfs_fattr_init(&dir_attr);
+        data = nfs3_alloc_createdata();
-        nfs_fattr_init(&fattr);
+        if (data == NULL)
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-        nfs_post_op_update_inode(dir, &dir_attr);
-        if (status != 0)
                goto out;
-        status = nfs_instantiate(dentry, &fh, &fattr);
+        data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+        data->arg.mknod.fh = NFS_FH(dir);
+        data->arg.mknod.name = dentry->d_name.name;
+        data->arg.mknod.len = dentry->d_name.len;
+        data->arg.mknod.sattr = sattr;
+        data->arg.mknod.rdev = rdev;
+        switch (sattr->ia_mode & S_IFMT) {
+        case S_IFBLK:
+                data->arg.mknod.type = NF3BLK;
+                break;
+        case S_IFCHR:
+                data->arg.mknod.type = NF3CHR;
+                break;
+        case S_IFIFO:
+                data->arg.mknod.type = NF3FIFO;
+                break;
+        case S_IFSOCK:
+                data->arg.mknod.type = NF3SOCK;
+                break;
+        default:
+                status = -EINVAL;
+                goto out;
+        }
+        status = nfs3_do_create(dir, dentry, data);
        if (status != 0)
                goto out;
        status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
 out:
+        nfs3_free_createdata(data);
        dprintk("NFS reply mknod: %d\n", status);
        return status;
 }
@@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
        .write_done     = nfs3_write_done,
        .commit_setup   = nfs3_proc_commit_setup,
        .commit_done    = nfs3_commit_done,
-        .file_open      = nfs_open,
-        .file_release   = nfs_release,
        .lock           = nfs3_proc_lock,
        .clear_acl_cache = nfs3_forget_cached_acls,
 };
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1293e0acd82b..c910413eaeca 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
                /* Save the delegation */
                memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
                rcu_read_unlock();
-                lock_kernel();
                ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
-                unlock_kernel();
                if (ret != 0)
                        goto out;
                ret = -EAGAIN;
@@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int
        return res;
 }
-static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
+static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
-                struct iattr *sattr, struct nfs4_state *state)
+                            struct nfs_fattr *fattr, struct iattr *sattr,
+                            struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_setattrargs  arg = {
@@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
                .server         = server,
        };
        struct rpc_message msg = {
-                .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+                .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
-                .rpc_argp       = &arg,
+                .rpc_argp       = &arg,
-                .rpc_resp       = &res,
+                .rpc_resp       = &res,
+                .rpc_cred       = cred,
        };
        unsigned long timestamp = jiffies;
        int status;
@@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
        if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
                /* Use that stateid */
        } else if (state != NULL) {
-                msg.rpc_cred = state->owner->so_cred;
                nfs4_copy_stateid(&arg.stateid, state, current->files);
        } else
                memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
        return status;
 }
-static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
-                struct iattr *sattr, struct nfs4_state *state)
+                           struct nfs_fattr *fattr, struct iattr *sattr,
+                           struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs4_exception exception = { };
        int err;
        do {
                err = nfs4_handle_exception(server,
-                                _nfs4_do_setattr(inode, fattr, sattr, state),
+                                _nfs4_do_setattr(inode, cred, fattr, sattr, state),
                                &exception);
        } while (exception.retry);
        return err;
@@ -1647,29 +1647,25 @@ static int
 nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
                  struct iattr *sattr)
 {
-        struct rpc_cred *cred;
        struct inode *inode = dentry->d_inode;
-        struct nfs_open_context *ctx;
+        struct rpc_cred *cred = NULL;
        struct nfs4_state *state = NULL;
        int status;
        nfs_fattr_init(fattr);
        
-        cred = rpc_lookup_cred();
-        if (IS_ERR(cred))
-                return PTR_ERR(cred);
        /* Search for an existing open(O_WRITE) file */
-        ctx = nfs_find_open_context(inode, cred, FMODE_WRITE);
+        if (sattr->ia_valid & ATTR_FILE) {
-        if (ctx != NULL)
+                struct nfs_open_context *ctx;
+                ctx = nfs_file_open_context(sattr->ia_file);
+                cred = ctx->cred;
                state = ctx->state;
+        }
-        status = nfs4_do_setattr(inode, fattr, sattr, state);
+        status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
        if (status == 0)
                nfs_setattr_update_inode(inode, sattr);
-        if (ctx != NULL)
-                put_nfs_open_context(ctx);
-        put_rpccred(cred);
        return status;
 }
@@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                goto out;
        }
        state = nfs4_do_open(dir, &path, flags, sattr, cred);
-        put_rpccred(cred);
        d_drop(dentry);
        if (IS_ERR(state)) {
                status = PTR_ERR(state);
-                goto out;
+                goto out_putcred;
        }
        d_add(dentry, igrab(state->inode));
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        if (flags & O_EXCL) {
                struct nfs_fattr fattr;
-                status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
+                status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
                if (status == 0)
                        nfs_setattr_update_inode(state->inode, sattr);
                nfs_post_op_update_inode(state->inode, &fattr);
@@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                status = nfs4_intent_set_file(nd, &path, state);
        else
                nfs4_close_sync(&path, state, flags);
+out_putcred:
+        put_rpccred(cred);
 out:
        return status;
 }
@@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
        return err;
 }
+/*
+ * Single allocation holding everything one NFSv4 create-type RPC needs:
+ * the RPC message, its argument and result structures, and the storage
+ * that the result's fh/fattr/dir_fattr pointers refer to.  The internal
+ * pointers are wired up by nfs4_alloc_createdata().
+ */
+struct nfs4_createdata {
+        struct rpc_message msg;
+        struct nfs4_create_arg arg;
+        struct nfs4_create_res res;
+        struct nfs_fh fh;
+        struct nfs_fattr fattr;
+        struct nfs_fattr dir_fattr;
+};
+/*
+ * Allocate and pre-fill an nfs4_createdata for creating @name in @dir.
+ *
+ * @dir:   parent directory; supplies the directory file handle and server
+ * @name:  name of the object to create
+ * @sattr: attributes passed through as the create arguments' attrs
+ * @ftype: file type stored in the create arguments
+ *
+ * The RPC procedure defaults to NFSPROC4_CLNT_CREATE; callers may override
+ * data->msg.rpc_proc and fill in data->arg.u before issuing the call.
+ *
+ * Returns the zero-initialised, wired-up structure, or NULL if the
+ * allocation failed.  Release with nfs4_free_createdata().
+ */
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+                struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+        struct nfs4_createdata *data;
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
+        if (data != NULL) {
+                struct nfs_server *server = NFS_SERVER(dir);
+                /* The message points at the arg/res embedded in this same allocation. */
+                data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+                data->msg.rpc_argp = &data->arg;
+                data->msg.rpc_resp = &data->res;
+                data->arg.dir_fh = NFS_FH(dir);
+                data->arg.server = server;
+                data->arg.name = name;
+                data->arg.attrs = sattr;
+                data->arg.ftype = ftype;
+                data->arg.bitmask = server->attr_bitmask;
+                /* Result pointers refer to the embedded fh/fattr storage. */
+                data->res.server = server;
+                data->res.fh = &data->fh;
+                data->res.fattr = &data->fattr;
+                data->res.dir_fattr = &data->dir_fattr;
+                nfs_fattr_init(data->res.fattr);
+                nfs_fattr_init(data->res.dir_fattr);
+        }
+        return data;
+}
+/*
+ * Issue the RPC prepared in @data synchronously on @dir's client.
+ *
+ * Only when the call returns 0 are the parent directory's change
+ * attribute and attributes updated from the reply and @dentry
+ * instantiated from the returned file handle and attributes.
+ *
+ * Returns the rpc_call_sync() status, or on RPC success the result of
+ * nfs_instantiate().  Does not free @data; that is the caller's job.
+ */
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+        int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+        if (status == 0) {
+                update_changeattr(dir, &data->res.dir_cinfo);
+                nfs_post_op_update_inode(dir, data->res.dir_fattr);
+                status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        }
+        return status;
+}
+/* Release a structure obtained from nfs4_alloc_createdata(). */
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+        kfree(data);
+}
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
                struct page *page, unsigned int len, struct iattr *sattr)
 {
-        struct nfs_server *server = NFS_SERVER(dir);
+        struct nfs4_createdata *data;
-        struct nfs_fh fhandle;
+        int status = -ENAMETOOLONG;
-        struct nfs_fattr fattr, dir_fattr;
-        struct nfs4_create_arg arg = {
-                .dir_fh = NFS_FH(dir),
-                .server = server,
-                .name = &dentry->d_name,
-                .attrs = sattr,
-                .ftype = NF4LNK,
-                .bitmask = server->attr_bitmask,
-        };
-        struct nfs4_create_res res = {
-                .server = server,
-                .fh = &fhandle,
-                .fattr = &fattr,
-                .dir_fattr = &dir_fattr,
-        };
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
-                .rpc_argp = &arg,
-                .rpc_resp = &res,
-        };
-        int                     status;
        if (len > NFS4_MAXPATHLEN)
-                return -ENAMETOOLONG;
+                goto out;
-        arg.u.symlink.pages = &page;
+        status = -ENOMEM;
-        arg.u.symlink.len = len;
+        data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
-        nfs_fattr_init(&fattr);
+        if (data == NULL)
-        nfs_fattr_init(&dir_fattr);
+                goto out;
+        data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+        data->arg.u.symlink.pages = &page;
+        data->arg.u.symlink.len = len;
        
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+        status = nfs4_do_create(dir, dentry, data);
-        if (!status) {
-                update_changeattr(dir, &res.dir_cinfo);
+        nfs4_free_createdata(data);
-                nfs_post_op_update_inode(dir, res.dir_fattr);
+out:
-                status = nfs_instantiate(dentry, &fhandle, &fattr);
-        }
        return status;
 }
@@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
                struct iattr *sattr)
 {
-        struct nfs_server *server = NFS_SERVER(dir);
+        struct nfs4_createdata *data;
-        struct nfs_fh fhandle;
+        int status = -ENOMEM;
-        struct nfs_fattr fattr, dir_fattr;
-        struct nfs4_create_arg arg = {
-                .dir_fh = NFS_FH(dir),
-                .server = server,
-                .name = &dentry->d_name,
-                .attrs = sattr,
-                .ftype = NF4DIR,
-                .bitmask = server->attr_bitmask,
-        };
-        struct nfs4_create_res res = {
-                .server = server,
-                .fh = &fhandle,
-                .fattr = &fattr,
-                .dir_fattr = &dir_fattr,
-        };
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
-                .rpc_argp = &arg,
-                .rpc_resp = &res,
-        };
-        int                     status;
-        nfs_fattr_init(&fattr);
+        data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
-        nfs_fattr_init(&dir_fattr);
+        if (data == NULL)
-        
+                goto out;
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-        if (!status) {
+        status = nfs4_do_create(dir, dentry, data);
-                update_changeattr(dir, &res.dir_cinfo);
-                nfs_post_op_update_inode(dir, res.dir_fattr);
+        nfs4_free_createdata(data);
-                status = nfs_instantiate(dentry, &fhandle, &fattr);
+out:
-        }
        return status;
 }
@@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
                struct iattr *sattr, dev_t rdev)
 {
-        struct nfs_server *server = NFS_SERVER(dir);
+        struct nfs4_createdata *data;
-        struct nfs_fh fh;
+        int mode = sattr->ia_mode;
-        struct nfs_fattr fattr, dir_fattr;
+        int status = -ENOMEM;
-        struct nfs4_create_arg arg = {
-                .dir_fh = NFS_FH(dir),
-                .server = server,
-                .name = &dentry->d_name,
-                .attrs = sattr,
-                .bitmask = server->attr_bitmask,
-        };
-        struct nfs4_create_res res = {
-                .server = server,
-                .fh = &fh,
-                .fattr = &fattr,
-                .dir_fattr = &dir_fattr,
-        };
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
-                .rpc_argp = &arg,
-                .rpc_resp = &res,
-        };
-        int                     status;
-        int                     mode = sattr->ia_mode;
-        nfs_fattr_init(&fattr);
-        nfs_fattr_init(&dir_fattr);
        BUG_ON(!(sattr->ia_valid & ATTR_MODE));
        BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
+        data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+        if (data == NULL)
+                goto out;
        if (S_ISFIFO(mode))
-                arg.ftype = NF4FIFO;
+                data->arg.ftype = NF4FIFO;
        else if (S_ISBLK(mode)) {
-                arg.ftype = NF4BLK;
+                data->arg.ftype = NF4BLK;
-                arg.u.device.specdata1 = MAJOR(rdev);
+                data->arg.u.device.specdata1 = MAJOR(rdev);
-                arg.u.device.specdata2 = MINOR(rdev);
+                data->arg.u.device.specdata2 = MINOR(rdev);
        }
        else if (S_ISCHR(mode)) {
-                arg.ftype = NF4CHR;
+                data->arg.ftype = NF4CHR;
-                arg.u.device.specdata1 = MAJOR(rdev);
+                data->arg.u.device.specdata1 = MAJOR(rdev);
-                arg.u.device.specdata2 = MINOR(rdev);
+                data->arg.u.device.specdata2 = MINOR(rdev);
        }
-        else
-                arg.ftype = NF4SOCK;
        
-        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+        status = nfs4_do_create(dir, dentry, data);
-        if (status == 0) {
-                update_changeattr(dir, &res.dir_cinfo);
+        nfs4_free_createdata(data);
-                nfs_post_op_update_inode(dir, res.dir_fattr);
+out:
-                status = nfs_instantiate(dentry, &fh, &fattr);
-        }
        return status;
 }
@@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
        ret = nfs_revalidate_inode(server, inode);
        if (ret < 0)
                return ret;
+        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+                nfs_zap_acl_cache(inode);
        ret = nfs4_read_cached_acl(inode, buf, buflen);
        if (ret != -ENOENT)
                return ret;
@@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
        nfs_inode_return_delegation(inode);
        buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
        ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-        nfs_zap_caches(inode);
+        nfs_access_zap_cache(inode);
+        nfs_zap_acl_cache(inode);
        return ret;
 }
@@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
                        task->tk_status = 0;
                        return -EAGAIN;
                case -NFS4ERR_DELAY:
-                        nfs_inc_server_stats((struct nfs_server *) server,
+                        nfs_inc_server_stats(server, NFSIOS_DELAY);
-                                                NFSIOS_DELAY);
                case -NFS4ERR_GRACE:
                        rpc_delay(task, NFS4_POLL_RETRY_MAX);
                        task->tk_status = 0;
@@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
 {
-        long timeout;
+        long timeout = 0;
        int err;
        do {
                err = _nfs4_proc_setclientid_confirm(clp, cred);
@@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .write_done     = nfs4_write_done,
        .commit_setup   = nfs4_proc_commit_setup,
        .commit_done    = nfs4_commit_done,
-        .file_open      = nfs_open,
-        .file_release   = nfs_release,
        .lock           = nfs4_proc_lock,
        .clear_acl_cache = nfs4_zap_acl_attr,
 };
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 856a8934f610..401ef8b28f97 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -940,7 +940,6 @@ static int reclaimer(void *ptr)
        allow_signal(SIGKILL);
        /* Ensure exclusive access to NFSv4 state */
-        lock_kernel();
        down_write(&clp->cl_sem);
        /* Are there any NFS mounts out there? */
        if (list_empty(&clp->cl_superblocks))
@@ -1000,7 +999,6 @@ restart_loop:
        nfs_delegation_reap_unclaimed(clp);
 out:
        up_write(&clp->cl_sem);
-        unlock_kernel();
        if (status == -NFS4ERR_CB_PATH_DOWN)
                nfs_handle_cb_pathdown(clp);
        nfs4_clear_recover_bit(clp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d36823..46763d1cd397 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,6 +1,4 @@
 /*
- *  $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $
- *
 *  Copyright (C) 1995, 1996  Gero Kuhlmann <gero@gkminix.han.de>
 *
 *  Allow an NFS filesystem to be mounted as root. The way this works is:
@@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name)
        nfs_data.flags    = NFS_MOUNT_NONLM;    /* No lockd in nfs root yet */
        nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
        nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
-        nfs_data.acregmin = 3;
+        nfs_data.acregmin = NFS_DEF_ACREGMIN;
-        nfs_data.acregmax = 60;
+        nfs_data.acregmax = NFS_DEF_ACREGMAX;
-        nfs_data.acdirmin = 30;
+        nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
-        nfs_data.acdirmax = 60;
+        nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
        strcpy(buf, NFS_ROOT);
        /* Process options received from the remote server */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 03599bfe81cf..4dbb84df1b68 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
        sattr->ia_mode &= S_IALLUGO;
        dprintk("NFS call  setattr\n");
+        if (sattr->ia_valid & ATTR_FILE)
+                msg.rpc_cred = nfs_file_cred(sattr->ia_file);
        nfs_fattr_init(fattr);
        status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
        if (status == 0)
@@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
        return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
+/* Helper functions for NFS lock bounds checking */
+/* Largest offset representable in a signed 32-bit value. */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+/*
+ * Check that the byte range of @fl fits in signed 32-bit offsets.
+ *
+ * fl_start, and fl_end unless it equals OFFSET_MAX, must survive a
+ * round-trip through __s32 unchanged.  An fl_end of OFFSET_MAX is
+ * treated as NFS_LOCK32_OFFSET_MAX.  The start must also be
+ * non-negative and must not exceed the end.
+ *
+ * Returns 0 if the range is acceptable, -EINVAL otherwise.
+ */
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+        __s32 start, end;
+        start = (__s32)fl->fl_start;
+        /* Truncation changed the value: the start does not fit in 32 bits. */
+        if ((loff_t)start != fl->fl_start)
+                goto out_einval;
+        if (fl->fl_end != OFFSET_MAX) {
+                end = (__s32)fl->fl_end;
+                if ((loff_t)end != fl->fl_end)
+                        goto out_einval;
+        } else
+                end = NFS_LOCK32_OFFSET_MAX;
+        if (start < 0 || start > end)
+                goto out_einval;
+        return 0;
+out_einval:
+        return -EINVAL;
+}
 const struct nfs_rpc_ops nfs_v2_clientops = {
        .version        = 2,                   /* protocol version */
@@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .write_setup    = nfs_proc_write_setup,
        .write_done     = nfs_write_done,
        .commit_setup   = nfs_proc_commit_setup,
-        .file_open      = nfs_open,
-        .file_release   = nfs_release,
        .lock           = nfs_proc_lock,
+        .lock_check_bounds = nfs_lock_check_bounds,
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 614efeed5437..1b94e3650f5c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
 #include <linux/inet.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
+#include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
@@ -65,7 +66,6 @@
 enum {
        /* Mount options that take no arguments */
        Opt_soft, Opt_hard,
-        Opt_intr, Opt_nointr,
        Opt_posix, Opt_noposix,
        Opt_cto, Opt_nocto,
        Opt_ac, Opt_noac,
@@ -92,8 +92,8 @@ enum {
        Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
-        /* Mount options that are ignored */
+        /* Special mount options */
-        Opt_userspace, Opt_deprecated,
+        Opt_userspace, Opt_deprecated, Opt_sloppy,
        Opt_err
 };
@@ -101,10 +101,14 @@ enum {
 static match_table_t nfs_mount_option_tokens = {
        { Opt_userspace, "bg" },
        { Opt_userspace, "fg" },
+        { Opt_userspace, "retry=%s" },
+        { Opt_sloppy, "sloppy" },
        { Opt_soft, "soft" },
        { Opt_hard, "hard" },
-        { Opt_intr, "intr" },
+        { Opt_deprecated, "intr" },
-        { Opt_nointr, "nointr" },
+        { Opt_deprecated, "nointr" },
        { Opt_posix, "posix" },
        { Opt_noposix, "noposix" },
        { Opt_cto, "cto" },
@@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_acdirmin, "acdirmin=%u" },
        { Opt_acdirmax, "acdirmax=%u" },
        { Opt_actimeo, "actimeo=%u" },
-        { Opt_userspace, "retry=%u" },
        { Opt_namelen, "namlen=%u" },
        { Opt_mountport, "mountport=%u" },
        { Opt_mountvers, "mountvers=%u" },
@@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
 static void nfs_put_super(struct super_block *);
+static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 static struct file_system_type nfs_fs_type = {
        .owner          = THIS_MODULE,
@@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = {
        .umount_begin   = nfs_umount_begin,
        .show_options   = nfs_show_options,
        .show_stats     = nfs_show_stats,
+        .remount_fs     = nfs_remount,
 };
 #ifdef CONFIG_NFS_V4
@@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = {
        .umount_begin   = nfs_umount_begin,
        .show_options   = nfs_show_options,
        .show_stats     = nfs_show_stats,
+        .remount_fs     = nfs_remount,
 };
 #endif
@@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        };
        int error;
-        lock_kernel();
        error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
        if (error < 0)
                goto out_err;
@@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_namelen = server->namelen;
-        unlock_kernel();
        return 0;
 out_err:
        dprintk("%s: statfs error = %d\n", __func__, -error);
-        unlock_kernel();
        return error;
 }
@@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        if (nfss->bsize != 0)
                seq_printf(m, ",bsize=%u", nfss->bsize);
        seq_printf(m, ",namlen=%u", nfss->namelen);
-        if (nfss->acregmin != 3*HZ || showdefaults)
+        if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
                seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
-        if (nfss->acregmax != 60*HZ || showdefaults)
+        if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
                seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
-        if (nfss->acdirmin != 30*HZ || showdefaults)
+        if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
                seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
-        if (nfss->acdirmax != 60*HZ || showdefaults)
+        if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
                seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
        for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
                if (nfss->flags & nfs_infop->flag)
@@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr)
        return 0;
 }
+static void nfs_parse_ipv4_address(char *string, size_t str_len,
+                                   struct sockaddr *sap, size_t *addr_len)
+{
+        struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+        u8 *addr = (u8 *)&sin->sin_addr.s_addr;
+        if (str_len <= INET_ADDRSTRLEN) {
+                dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
+                                (int)str_len, string);
+                sin->sin_family = AF_INET;
+                *addr_len = sizeof(*sin);
+                if (in4_pton(string, str_len, addr, '\0', NULL))
+                        return;
+        }
+        sap->sa_family = AF_UNSPEC;
+        *addr_len = 0;
+}
+#define IPV6_SCOPE_DELIMITER    '%'
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+                                    const char *delim,
+                                    struct sockaddr_in6 *sin6)
+{
+        char *p;
+        size_t len;
+        if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+                return ;
+        if (*delim != IPV6_SCOPE_DELIMITER)
+                return;
+        len = (string + str_len) - delim - 1;
+        p = kstrndup(delim + 1, len, GFP_KERNEL);
+        if (p) {
+                unsigned long scope_id = 0;
+                struct net_device *dev;
+                dev = dev_get_by_name(&init_net, p);
+                if (dev != NULL) {
+                        scope_id = dev->ifindex;
+                        dev_put(dev);
+                } else {
+                        /* scope_id is set to zero on error */
+                        strict_strtoul(p, 10, &scope_id);
+                }
+                kfree(p);
+                sin6->sin6_scope_id = scope_id;
+                dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+        }
+}
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+                                   struct sockaddr *sap, size_t *addr_len)
+{
+        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+        u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
+        const char *delim;
+        if (str_len <= INET6_ADDRSTRLEN) {
+                dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
+                                (int)str_len, string);
+                sin6->sin6_family = AF_INET6;
+                *addr_len = sizeof(*sin6);
+                if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
+                        nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
+                        return;
+                }
+        }
+        sap->sa_family = AF_UNSPEC;
+        *addr_len = 0;
+}
+#else
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+                                   struct sockaddr *sap, size_t *addr_len)
+{
+        sap->sa_family = AF_UNSPEC;
+        *addr_len = 0;
+}
+#endif
 /*
- * Parse string addresses passed in via a mount option,
+ * Construct a sockaddr based on the contents of a string that contains
- * and construct a sockaddr based on the result.
+ * an IP address in presentation format.
 *
- * If address parsing fails, set the sockaddr's address
+ * If there is a problem constructing the new sockaddr, set the address
- * family to AF_UNSPEC to force nfs_verify_server_address()
+ * family to AF_UNSPEC.
- * to punt the mount.
 */
-static void nfs_parse_server_address(char *value,
+static void nfs_parse_ip_address(char *string, size_t str_len,
-                                     struct sockaddr *sap,
+                                 struct sockaddr *sap, size_t *addr_len)
-                                     size_t *len)
 {
-        if (strchr(value, ':')) {
+        unsigned int i, colons;
-                struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
-                u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
-                ap->sin6_family = AF_INET6;
+        colons = 0;
-                *len = sizeof(*ap);
+        for (i = 0; i < str_len; i++)
-                if (in6_pton(value, -1, addr, '\0', NULL))
+                if (string[i] == ':')
-                        return;
+                        colons++;
-        } else {
-                struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+        if (colons >= 2)
-                u8 *addr = (u8 *)&ap->sin_addr.s_addr;
+                nfs_parse_ipv6_address(string, str_len, sap, addr_len);
+        else
+                nfs_parse_ipv4_address(string, str_len, sap, addr_len);
+}
+/*
+ * Sanity check the NFS transport protocol.  If the requested protocol
+ * is not UDP, TCP, or RDMA, fall back to TCP.
+ */
+static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+        switch (mnt->nfs_server.protocol) {
+        case XPRT_TRANSPORT_UDP:
+        case XPRT_TRANSPORT_TCP:
+        case XPRT_TRANSPORT_RDMA:
+                break;
+        default:
+                mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+        }
+}
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * settings should depend upon the specified NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+        nfs_validate_transport_protocol(mnt);
-                ap->sin_family = AF_INET;
+        if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
-                *len = sizeof(*ap);
+            mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
-                if (in4_pton(value, -1, addr, '\0', NULL))
                        return;
+        switch (mnt->nfs_server.protocol) {
+        case XPRT_TRANSPORT_UDP:
+                mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
+                break;
+        case XPRT_TRANSPORT_TCP:
+        case XPRT_TRANSPORT_RDMA:
+                mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
        }
+}
-        sap->sa_family = AF_UNSPEC;
+/*
-        *len = 0;
+ * Parse the value of the 'sec=' option.
+ *
+ * The flavor_len setting is for v4 mounts.
+ */
+static int nfs_parse_security_flavors(char *value,
+                                      struct nfs_parsed_mount_data *mnt)
+{
+        substring_t args[MAX_OPT_ARGS];
+        dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
+        switch (match_token(value, nfs_secflavor_tokens, args)) {
+        case Opt_sec_none:
+                mnt->auth_flavor_len = 0;
+                mnt->auth_flavors[0] = RPC_AUTH_NULL;
+                break;
+        case Opt_sec_sys:
+                mnt->auth_flavor_len = 0;
+                mnt->auth_flavors[0] = RPC_AUTH_UNIX;
+                break;
+        case Opt_sec_krb5:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
+                break;
+        case Opt_sec_krb5i:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
+                break;
+        case Opt_sec_krb5p:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
+                break;
+        case Opt_sec_lkey:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
+                break;
+        case Opt_sec_lkeyi:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
+                break;
+        case Opt_sec_lkeyp:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
+                break;
+        case Opt_sec_spkm:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
+                break;
+        case Opt_sec_spkmi:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
+                break;
+        case Opt_sec_spkmp:
+                mnt->auth_flavor_len = 1;
+                mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
+                break;
+        default:
+                return 0;
+        }
+        return 1;
+}
+static void nfs_parse_invalid_value(const char *option)
+{
+        dfprintk(MOUNT, "NFS:   bad value specified for %s option\n", option);
 }
 /*
 * Error-check and convert a string of mount options from user space into
- * a data structure
+ * a data structure.  The whole mount string is processed; bad options are
+ * skipped as they are encountered.  If there were no errors, return 1;
+ * otherwise return 0 (zero).
 */
 static int nfs_parse_mount_options(char *raw,
                                   struct nfs_parsed_mount_data *mnt)
 {
        char *p, *string, *secdata;
-        int rc;
+        int rc, sloppy = 0, errors = 0;
        if (!raw) {
                dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw,
                token = match_token(p, nfs_mount_option_tokens, args);
                switch (token) {
+                /*
+                 * boolean options:  foo/nofoo
+                 */
                case Opt_soft:
                        mnt->flags |= NFS_MOUNT_SOFT;
                        break;
                case Opt_hard:
                        mnt->flags &= ~NFS_MOUNT_SOFT;
                        break;
-                case Opt_intr:
-                case Opt_nointr:
-                        break;
                case Opt_posix:
                        mnt->flags |= NFS_MOUNT_POSIX;
                        break;
@@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
                case Opt_udp:
                        mnt->flags &= ~NFS_MOUNT_TCP;
                        mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
-                        mnt->timeo = 7;
-                        mnt->retrans = 5;
                        break;
                case Opt_tcp:
                        mnt->flags |= NFS_MOUNT_TCP;
                        mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-                        mnt->timeo = 600;
-                        mnt->retrans = 2;
                        break;
                case Opt_rdma:
                        mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
                        mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
-                        mnt->timeo = 600;
-                        mnt->retrans = 2;
                        break;
                case Opt_acl:
                        mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->flags |= NFS_MOUNT_UNSHARED;
                        break;
+                /*
+                 * options that take numeric values
+                 */
                case Opt_port:
-                        if (match_int(args, &option))
+                        if (match_int(args, &option) ||
-                                return 0;
+                            option < 0 || option > USHORT_MAX) {
-                        if (option < 0 || option > 65535)
+                                errors++;
-                                return 0;
+                                nfs_parse_invalid_value("port");
-                        mnt->nfs_server.port = option;
+                        } else
+                                mnt->nfs_server.port = option;
                        break;
                case Opt_rsize:
-                        if (match_int(args, &mnt->rsize))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("rsize");
+                        } else
+                                mnt->rsize = option;
                        break;
                case Opt_wsize:
-                        if (match_int(args, &mnt->wsize))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("wsize");
+                        } else
+                                mnt->wsize = option;
                        break;
                case Opt_bsize:
-                        if (match_int(args, &option))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
-                        if (option < 0)
+                                nfs_parse_invalid_value("bsize");
-                                return 0;
+                        } else
-                        mnt->bsize = option;
+                                mnt->bsize = option;
                        break;
                case Opt_timeo:
-                        if (match_int(args, &mnt->timeo))
+                        if (match_int(args, &option) || option <= 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("timeo");
+                        } else
+                                mnt->timeo = option;
                        break;
                case Opt_retrans:
-                        if (match_int(args, &mnt->retrans))
+                        if (match_int(args, &option) || option <= 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("retrans");
+                        } else
+                                mnt->retrans = option;
                        break;
                case Opt_acregmin:
-                        if (match_int(args, &mnt->acregmin))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("acregmin");
+                        } else
+                                mnt->acregmin = option;
                        break;
                case Opt_acregmax:
-                        if (match_int(args, &mnt->acregmax))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("acregmax");
+                        } else
+                                mnt->acregmax = option;
                        break;
                case Opt_acdirmin:
-                        if (match_int(args, &mnt->acdirmin))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("acdirmin");
+                        } else
+                                mnt->acdirmin = option;
                        break;
                case Opt_acdirmax:
-                        if (match_int(args, &mnt->acdirmax))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("acdirmax");
+                        } else
+                                mnt->acdirmax = option;
                        break;
                case Opt_actimeo:
-                        if (match_int(args, &option))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
-                        if (option < 0)
+                                nfs_parse_invalid_value("actimeo");
-                                return 0;
+                        } else
-                        mnt->acregmin =
+                                mnt->acregmin = mnt->acregmax =
-                        mnt->acregmax =
+                                mnt->acdirmin = mnt->acdirmax = option;
-                        mnt->acdirmin =
-                        mnt->acdirmax = option;
                        break;
                case Opt_namelen:
-                        if (match_int(args, &mnt->namlen))
+                        if (match_int(args, &option) || option < 0) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("namlen");
+                        } else
+                                mnt->namlen = option;
                        break;
                case Opt_mountport:
-                        if (match_int(args, &option))
+                        if (match_int(args, &option) ||
-                                return 0;
+                            option < 0 || option > USHORT_MAX) {
-                        if (option < 0 || option > 65535)
+                                errors++;
-                                return 0;
+                                nfs_parse_invalid_value("mountport");
-                        mnt->mount_server.port = option;
+                        } else
+                                mnt->mount_server.port = option;
                        break;
                case Opt_mountvers:
-                        if (match_int(args, &option))
+                        if (match_int(args, &option) ||
-                                return 0;
+                            option < NFS_MNT_VERSION ||
-                        if (option < 0)
+                            option > NFS_MNT3_VERSION) {
-                                return 0;
+                                errors++;
-                        mnt->mount_server.version = option;
+                                nfs_parse_invalid_value("mountvers");
+                        } else
+                                mnt->mount_server.version = option;
                        break;
                case Opt_nfsvers:
-                        if (match_int(args, &option))
+                        if (match_int(args, &option)) {
-                                return 0;
+                                errors++;
+                                nfs_parse_invalid_value("nfsvers");
+                                break;
+                        }
                        switch (option) {
-                        case 2:
+                        case NFS2_VERSION:
                                mnt->flags &= ~NFS_MOUNT_VER3;
                                break;
-                        case 3:
+                        case NFS3_VERSION:
                                mnt->flags |= NFS_MOUNT_VER3;
                                break;
                        default:
-                                goto out_unrec_vers;
+                                errors++;
+                                nfs_parse_invalid_value("nfsvers");
                        }
                        break;
+                /*
+                 * options that take text values
+                 */
                case Opt_sec:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                        token = match_token(string, nfs_secflavor_tokens, args);
+                        rc = nfs_parse_security_flavors(string, mnt);
                        kfree(string);
+                        if (!rc) {
-                        /*
+                                errors++;
-                         * The flags setting is for v2/v3.  The flavor_len
+                                dfprintk(MOUNT, "NFS:   unrecognized "
-                         * setting is for v4.  v2/v3 also need to know the
+                                                "security flavor\n");
-                         * difference between NULL and UNIX.
-                         */
-                        switch (token) {
-                        case Opt_sec_none:
-                                mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 0;
-                                mnt->auth_flavors[0] = RPC_AUTH_NULL;
-                                break;
-                        case Opt_sec_sys:
-                                mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 0;
-                                mnt->auth_flavors[0] = RPC_AUTH_UNIX;
-                                break;
-                        case Opt_sec_krb5:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
-                                break;
-                        case Opt_sec_krb5i:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
-                                break;
-                        case Opt_sec_krb5p:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
-                                break;
-                        case Opt_sec_lkey:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
-                                break;
-                        case Opt_sec_lkeyi:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
-                                break;
-                        case Opt_sec_lkeyp:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
-                                break;
-                        case Opt_sec_spkm:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
-                                break;
-                        case Opt_sec_spkmi:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
-                                break;
-                        case Opt_sec_spkmp:
-                                mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-                                mnt->auth_flavor_len = 1;
-                                mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
-                                break;
-                        default:
-                                goto out_unrec_sec;
                        }
                        break;
                case Opt_proto:
@@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw,
                        case Opt_xprt_udp:
                                mnt->flags &= ~NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
-                                mnt->timeo = 7;
-                                mnt->retrans = 5;
                                break;
                        case Opt_xprt_tcp:
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-                                mnt->timeo = 600;
-                                mnt->retrans = 2;
                                break;
                        case Opt_xprt_rdma:
                                /* vector side protocols to TCP */
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
-                                mnt->timeo = 600;
-                                mnt->retrans = 2;
                                break;
                        default:
-                                goto out_unrec_xprt;
+                                errors++;
+                                dfprintk(MOUNT, "NFS:   unrecognized "
+                                                "transport protocol\n");
                        }
                        break;
                case Opt_mountproto:
@@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw,
                                break;
                        case Opt_xprt_rdma: /* not used for side protocols */
                        default:
-                                goto out_unrec_xprt;
+                                errors++;
+                                dfprintk(MOUNT, "NFS:   unrecognized "
+                                                "transport protocol\n");
                        }
                        break;
                case Opt_addr:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                        nfs_parse_server_address(string, (struct sockaddr *)
+                        nfs_parse_ip_address(string, strlen(string),
-                                                 &mnt->nfs_server.address,
+                                             (struct sockaddr *)
-                                                 &mnt->nfs_server.addrlen);
+                                                &mnt->nfs_server.address,
+                                             &mnt->nfs_server.addrlen);
                        kfree(string);
                        break;
                case Opt_clientaddr:
@@ -1093,24 +1252,33 @@ static int nfs_parse_mount_options(char *raw,
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                        nfs_parse_server_address(string, (struct sockaddr *)
+                        nfs_parse_ip_address(string, strlen(string),
-                                                 &mnt->mount_server.address,
+                                             (struct sockaddr *)
-                                                 &mnt->mount_server.addrlen);
+                                                &mnt->mount_server.address,
+                                             &mnt->mount_server.addrlen);
                        kfree(string);
                        break;
+                /*
+                 * Special options
+                 */
+                case Opt_sloppy:
+                        sloppy = 1;
+                        dfprintk(MOUNT, "NFS:   relaxing parsing rules\n");
+                        break;
                case Opt_userspace:
                case Opt_deprecated:
+                        dfprintk(MOUNT, "NFS:   ignoring mount option "
+                                        "'%s'\n", p);
                        break;
                default:
-                        goto out_unknown;
+                        errors++;
+                        dfprintk(MOUNT, "NFS:   unrecognized mount option "
+                                        "'%s'\n", p);
                }
        }
-        nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
-                                mnt->nfs_server.port);
        return 1;
 out_nomem:
@@ -1120,21 +1288,6 @@ out_security_failure:
        free_secdata(secdata);
        printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
        return 0;
-out_unrec_vers:
-        printk(KERN_INFO "NFS: unrecognized NFS version number\n");
-        return 0;
-out_unrec_xprt:
-        printk(KERN_INFO "NFS: unrecognized transport protocol\n");
-        return 0;
-out_unrec_sec:
-        printk(KERN_INFO "NFS: unrecognized security flavor\n");
-        return 0;
-out_unknown:
-        printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
-        return 0;
 }
 /*
@@ -1188,11 +1341,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
        if (status == 0)
                return 0;
-        dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
+        dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
                        hostname, status);
        return status;
 }
+static int nfs_parse_simple_hostname(const char *dev_name,
+                                     char **hostname, size_t maxnamlen,
+                                     char **export_path, size_t maxpathlen)
+{
+        size_t len;
+        char *colon, *comma;
+        colon = strchr(dev_name, ':');
+        if (colon == NULL)
+                goto out_bad_devname;
+        len = colon - dev_name;
+        if (len > maxnamlen)
+                goto out_hostname;
+        /* N.B. caller will free nfs_server.hostname in all cases */
+        *hostname = kstrndup(dev_name, len, GFP_KERNEL);
+        if (!*hostname)
+                goto out_nomem;
+        /* kill possible hostname list: not supported */
+        comma = strchr(*hostname, ',');
+        if (comma != NULL) {
+                if (comma == *hostname)
+                        goto out_bad_devname;
+                *comma = '\0';
+        }
+        colon++;
+        len = strlen(colon);
+        if (len > maxpathlen)
+                goto out_path;
+        *export_path = kstrndup(colon, len, GFP_KERNEL);
+        if (!*export_path)
+                goto out_nomem;
+        dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
+        return 0;
+out_bad_devname:
+        dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+        return -EINVAL;
+out_nomem:
+        dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+        return -ENOMEM;
+out_hostname:
+        dfprintk(MOUNT, "NFS: server hostname too long\n");
+        return -ENAMETOOLONG;
+out_path:
+        dfprintk(MOUNT, "NFS: export pathname too long\n");
+        return -ENAMETOOLONG;
+}
+/*
+ * Hostname has square brackets around it because it contains one or
+ * more colons.  We look for the first closing square bracket, and a
+ * colon must follow it.
+ */
+static int nfs_parse_protected_hostname(const char *dev_name,
+                                        char **hostname, size_t maxnamlen,
+                                        char **export_path, size_t maxpathlen)
+{
+        size_t len;
+        char *start, *end;
+        start = (char *)(dev_name + 1);
+        end = strchr(start, ']');
+        if (end == NULL)
+                goto out_bad_devname;
+        if (*(end + 1) != ':')
+                goto out_bad_devname;
+        len = end - start;
+        if (len > maxnamlen)
+                goto out_hostname;
+        /* N.B. caller will free nfs_server.hostname in all cases */
+        *hostname = kstrndup(start, len, GFP_KERNEL);
+        if (*hostname == NULL)
+                goto out_nomem;
+        end += 2;
+        len = strlen(end);
+        if (len > maxpathlen)
+                goto out_path;
+        *export_path = kstrndup(end, len, GFP_KERNEL);
+        if (!*export_path)
+                goto out_nomem;
+        return 0;
+out_bad_devname:
+        dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+        return -EINVAL;
+out_nomem:
+        dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+        return -ENOMEM;
+out_hostname:
+        dfprintk(MOUNT, "NFS: server hostname too long\n");
+        return -ENAMETOOLONG;
+out_path:
+        dfprintk(MOUNT, "NFS: export pathname too long\n");
+        return -ENAMETOOLONG;
+}
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon marks the split between the server's hostname
+ * and the export path.  If the device name starts with a left square
+ * bracket, the bracketed hostname may itself contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+                             char **hostname, size_t maxnamlen,
+                             char **export_path, size_t maxpathlen)
+{
+        if (*dev_name == '[')
+                return nfs_parse_protected_hostname(dev_name,
+                                                    hostname, maxnamlen,
+                                                    export_path, maxpathlen);
+        return nfs_parse_simple_hostname(dev_name,
+                                         hostname, maxnamlen,
+                                         export_path, maxpathlen);
+}
 /*
 * Validate the NFS2/NFS3 mount data
 * - fills in the mount root filehandle
@@ -1222,16 +1510,14 @@ static int nfs_validate_mount_data(void *options,
        args->flags             = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
        args->rsize             = NFS_MAX_FILE_IO_SIZE;
        args->wsize             = NFS_MAX_FILE_IO_SIZE;
-        args->timeo             = 600;
+        args->acregmin          = NFS_DEF_ACREGMIN;
-        args->retrans           = 2;
+        args->acregmax          = NFS_DEF_ACREGMAX;
-        args->acregmin          = 3;
+        args->acdirmin          = NFS_DEF_ACDIRMIN;
-        args->acregmax          = 60;
+        args->acdirmax          = NFS_DEF_ACDIRMAX;
-        args->acdirmin          = 30;
-        args->acdirmax          = 60;
        args->mount_server.port = 0;    /* autobind unless user sets port */
-        args->mount_server.protocol = XPRT_TRANSPORT_UDP;
        args->nfs_server.port   = 0;    /* autobind unless user sets port */
        args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+        args->auth_flavors[0]   = RPC_AUTH_UNIX;
        switch (data->version) {
        case 1:
@@ -1289,7 +1575,9 @@ static int nfs_validate_mount_data(void *options,
                args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
                args->namlen            = data->namlen;
                args->bsize             = data->bsize;
-                args->auth_flavors[0]   = data->pseudoflavor;
+                if (data->flags & NFS_MOUNT_SECFLAVOUR)
+                        args->auth_flavors[0] = data->pseudoflavor;
                if (!args->nfs_server.hostname)
                        goto out_nomem;
@@ -1321,8 +1609,6 @@ static int nfs_validate_mount_data(void *options,
                break;
        default: {
-                unsigned int len;
-                char *c;
                int status;
                if (nfs_parse_mount_options((char *)options, args) == 0)
@@ -1332,21 +1618,22 @@ static int nfs_validate_mount_data(void *options,
                                                &args->nfs_server.address))
                        goto out_no_address;
-                c = strchr(dev_name, ':');
+                nfs_set_port((struct sockaddr *)&args->nfs_server.address,
-                if (c == NULL)
+                                args->nfs_server.port);
-                        return -EINVAL;
-                len = c - dev_name;
-                /* N.B. caller will free nfs_server.hostname in all cases */
-                args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
-                if (!args->nfs_server.hostname)
-                        goto out_nomem;
-                c++;
+                nfs_set_mount_transport_protocol(args);
-                if (strlen(c) > NFS_MAXPATHLEN)
-                        return -ENAMETOOLONG;
+                status = nfs_parse_devname(dev_name,
-                args->nfs_server.export_path = c;
+                                           &args->nfs_server.hostname,
+                                           PAGE_SIZE,
+                                           &args->nfs_server.export_path,
+                                           NFS_MAXPATHLEN);
+                if (!status)
+                        status = nfs_try_mount(args, mntfh);
+                kfree(args->nfs_server.export_path);
+                args->nfs_server.export_path = NULL;
-                status = nfs_try_mount(args, mntfh);
                if (status)
                        return status;
@@ -1354,9 +1641,6 @@ static int nfs_validate_mount_data(void *options,
                }
        }
-        if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
-                args->auth_flavors[0] = RPC_AUTH_UNIX;
 #ifndef CONFIG_NFS_V3
        if (args->flags & NFS_MOUNT_VER3)
                goto out_v3_not_compiled;
@@ -1396,6 +1680,80 @@ out_invalid_fh:
        return -EINVAL;
 }
+/*
+ * Check parsed remount options against the settings of the live mount.
+ *
+ * Fields are compared one group at a time, in a fixed order; the first
+ * mismatch ends the check.  Returns 0 if everything matches, -EINVAL
+ * otherwise.
+ */
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+                         struct nfs_parsed_mount_data *data)
+{
+        /* mount flags and transfer sizes */
+        if (data->flags != nfss->flags)
+                return -EINVAL;
+        if (data->rsize != nfss->rsize || data->wsize != nfss->wsize)
+                return -EINVAL;
+        /* values read back from nfss->client */
+        if (data->retrans != nfss->client->cl_timeout->to_retries)
+                return -EINVAL;
+        if (data->auth_flavors[0] != nfss->client->cl_auth->au_flavor)
+                return -EINVAL;
+        /* attribute cache limits: the server's values are divided by HZ */
+        if (data->acregmin != nfss->acregmin / HZ ||
+            data->acregmax != nfss->acregmax / HZ)
+                return -EINVAL;
+        if (data->acdirmin != nfss->acdirmin / HZ ||
+            data->acdirmax != nfss->acdirmax / HZ)
+                return -EINVAL;
+        if (data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ))
+                return -EINVAL;
+        /* server address: the length must agree before the bytes are compared */
+        if (data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen)
+                return -EINVAL;
+        if (memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+                   data->nfs_server.addrlen) != 0)
+                return -EINVAL;
+        return 0;
+}
+/*
+ * Handle a remount request by checking that the requested options agree
+ * with the options the filesystem is already mounted with.
+ *
+ * Returns 0 when the remount is accepted, -ENOMEM if the scratch copy of
+ * the mount data cannot be allocated, and -EINVAL if the option string
+ * cannot be parsed or asks for a setting that differs from the live mount.
+ */
+static int
+nfs_remount(struct super_block *sb, int *flags, char *raw_data)
+{
+        int error;
+        struct nfs_server *nfss = sb->s_fs_info;
+        struct nfs_parsed_mount_data *data;
+        struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
+        struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
+        u32 nfsvers = nfss->nfs_client->rpc_ops->version;
+        /*
+         * A remount may arrive without any option data.  There is nothing
+         * to compare in that case, and raw_data must not be dereferenced
+         * by the version checks below.
+         */
+        if (raw_data == NULL)
+                return 0;
+        /*
+         * Userspace mount programs that send binary options generally send
+         * them populated with default values. We have no way to know which
+         * ones were explicitly specified. Fall back to legacy behavior and
+         * just return success.
+         */
+        if ((nfsvers == 4 && options4->version == 1) ||
+            (nfsvers <= 3 && options->version >= 1 &&
+             options->version <= 6))
+                return 0;
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
+        if (data == NULL)
+                return -ENOMEM;
+        /* fill out struct with values from existing mount */
+        data->flags = nfss->flags;
+        data->rsize = nfss->rsize;
+        data->wsize = nfss->wsize;
+        data->retrans = nfss->client->cl_timeout->to_retries;
+        data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
+        data->acregmin = nfss->acregmin / HZ;
+        data->acregmax = nfss->acregmax / HZ;
+        data->acdirmin = nfss->acdirmin / HZ;
+        data->acdirmax = nfss->acdirmax / HZ;
+        data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+        data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+        memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+                data->nfs_server.addrlen);
+        /*
+         * Overwrite those values with any that were specified.
+         * nfs_parse_mount_options() signals failure by returning 0 (the
+         * other callers in this file test "== 0"), not a negative errno,
+         * so translate that into -EINVAL here.
+         */
+        if (nfs_parse_mount_options((char *)options, data) == 0) {
+                error = -EINVAL;
+                goto out;
+        }
+        /* compare new mount options with old ones */
+        error = nfs_compare_remount_data(nfss, data);
+out:
+        kfree(data);
+        return error;
+}
 /*
 * Initialise the common bits of the superblock
 */
@@ -1811,14 +2169,13 @@ static int nfs4_validate_mount_data(void *options,
        args->rsize             = NFS_MAX_FILE_IO_SIZE;
        args->wsize             = NFS_MAX_FILE_IO_SIZE;
-        args->timeo             = 600;
+        args->acregmin          = NFS_DEF_ACREGMIN;
-        args->retrans           = 2;
+        args->acregmax          = NFS_DEF_ACREGMAX;
-        args->acregmin          = 3;
+        args->acdirmin          = NFS_DEF_ACDIRMIN;
-        args->acregmax          = 60;
+        args->acdirmax          = NFS_DEF_ACDIRMAX;
-        args->acdirmin          = 30;
-        args->acdirmax          = 60;
        args->nfs_server.port   = NFS_PORT; /* 2049 unless user set port= */
-        args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+        args->auth_flavors[0]   = RPC_AUTH_UNIX;
+        args->auth_flavor_len   = 0;
        switch (data->version) {
        case 1:
@@ -1834,18 +2191,13 @@ static int nfs4_validate_mount_data(void *options,
                                                &args->nfs_server.address))
                        goto out_no_address;
-                switch (data->auth_flavourlen) {
+                if (data->auth_flavourlen) {
-                case 0:
+                        if (data->auth_flavourlen > 1)
-                        args->auth_flavors[0] = RPC_AUTH_UNIX;
+                                goto out_inval_auth;
-                        break;
-                case 1:
                        if (copy_from_user(&args->auth_flavors[0],
                                           data->auth_flavours,
                                           sizeof(args->auth_flavors[0])))
                                return -EFAULT;
-                        break;
-                default:
-                        goto out_inval_auth;
                }
                c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
@@ -1879,10 +2231,11 @@ static int nfs4_validate_mount_data(void *options,
                args->acdirmin  = data->acdirmin;
                args->acdirmax  = data->acdirmax;
                args->nfs_server.protocol = data->proto;
+                nfs_validate_transport_protocol(args);
                break;
        default: {
-                unsigned int len;
+                int status;
                if (nfs_parse_mount_options((char *)options, args) == 0)
                        return -EINVAL;
@@ -1891,44 +2244,25 @@ static int nfs4_validate_mount_data(void *options,
                                                &args->nfs_server.address))
                        return -EINVAL;
-                switch (args->auth_flavor_len) {
+                nfs_set_port((struct sockaddr *)&args->nfs_server.address,
-                case 0:
+                                args->nfs_server.port);
-                        args->auth_flavors[0] = RPC_AUTH_UNIX;
-                        break;
-                case 1:
-                        break;
-                default:
-                        goto out_inval_auth;
-                }
-                /*
+                nfs_validate_transport_protocol(args);
-                 * Split "dev_name" into "hostname:mntpath".
-                 */
-                c = strchr(dev_name, ':');
-                if (c == NULL)
-                        return -EINVAL;
-                /* while calculating len, pretend ':' is '\0' */
-                len = c - dev_name;
-                if (len > NFS4_MAXNAMLEN)
-                        return -ENAMETOOLONG;
-                /* N.B. caller will free nfs_server.hostname in all cases */
-                args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
-                if (!args->nfs_server.hostname)
-                        goto out_nomem;
-                c++;                    /* step over the ':' */
-                len = strlen(c);
-                if (len > NFS4_MAXPATHLEN)
-                        return -ENAMETOOLONG;
-                args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
-                if (!args->nfs_server.export_path)
-                        goto out_nomem;
-                dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
+                if (args->auth_flavor_len > 1)
+                        goto out_inval_auth;
                if (args->client_address == NULL)
                        goto out_no_client_address;
+                status = nfs_parse_devname(dev_name,
+                                           &args->nfs_server.hostname,
+                                           NFS4_MAXNAMLEN,
+                                           &args->nfs_server.export_path,
+                                           NFS4_MAXPATHLEN);
+                if (status < 0)
+                        return status;
                break;
                }
        }
@@ -1944,10 +2278,6 @@ out_inval_auth:
                 data->auth_flavourlen);
        return -EINVAL;
-out_nomem:
-        dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
-        return -ENOMEM;
 out_no_address:
        dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
        return -EINVAL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f333848fd3be..3229e217c773 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -34,9 +34,6 @@
 /*
 * Local function declarations
 */
-static struct nfs_page * nfs_update_request(struct nfs_open_context*,
-                                            struct page *,
-                                            unsigned int, unsigned int);
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
                                  struct inode *inode, int ioflags);
 static void nfs_redirty_request(struct nfs_page *req);
@@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
 {
        struct inode *inode = page->mapping->host;
-        loff_t end, i_size = i_size_read(inode);
+        loff_t end, i_size;
-        pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+        pgoff_t end_index;
+        spin_lock(&inode->i_lock);
+        i_size = i_size_read(inode);
+        end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
        if (i_size > 0 && page->index < end_index)
-                return;
+                goto out;
        end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
        if (i_size >= end)
-                return;
+                goto out;
-        nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
        i_size_write(inode, end);
+        nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+        spin_unlock(&inode->i_lock);
 }
 /* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
        SetPageUptodate(page);
 }
-static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
-                unsigned int offset, unsigned int count)
-{
-        struct nfs_page *req;
-        int ret;
-        for (;;) {
-                req = nfs_update_request(ctx, page, offset, count);
-                if (!IS_ERR(req))
-                        break;
-                ret = PTR_ERR(req);
-                if (ret != -EBUSY)
-                        return ret;
-                ret = nfs_wb_page(page->mapping->host, page);
-                if (ret != 0)
-                        return ret;
-        }
-        /* Update file length */
-        nfs_grow_file(page, offset, count);
-        nfs_clear_page_tag_locked(req);
-        return 0;
-}
 static int wb_priority(struct writeback_control *wbc)
 {
        if (wbc->for_reclaim)
@@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                        return ret;
                spin_lock(&inode->i_lock);
        }
-        if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+        if (test_bit(PG_CLEAN, &req->wb_flags)) {
-                /* This request is marked for commit */
                spin_unlock(&inode->i_lock);
-                nfs_clear_page_tag_locked(req);
+                BUG();
-                nfs_pageio_complete(pgio);
-                return 0;
        }
        if (nfs_set_page_writeback(page) != 0) {
                spin_unlock(&inode->i_lock);
@@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 /*
 * Insert a write request into an inode
 */
-static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        int error;
+        error = radix_tree_preload(GFP_NOFS);
+        if (error != 0)
+                goto out;
+        /* Lock the request! */
+        nfs_lock_request_dontget(req);
+        spin_lock(&inode->i_lock);
        error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
        BUG_ON(error);
        if (!nfsi->npages) {
@@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
        kref_get(&req->wb_kref);
        radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
                                NFS_PAGE_TAG_LOCKED);
+        spin_unlock(&inode->i_lock);
+        radix_tree_preload_end();
+out:
+        return error;
 }
 /*
@@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
        __set_page_dirty_nobuffers(req->wb_page);
 }
-/*
- * Check if a request is dirty
- */
-static inline int
-nfs_dirty_request(struct nfs_page *req)
-{
-        struct page *page = req->wb_page;
-        if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
-                return 0;
-        return !PageWriteback(page);
-}
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 /*
 * Add a request to the inode's commit list.
@@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req)
        spin_lock(&inode->i_lock);
        nfsi->ncommit++;
-        set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+        set_bit(PG_CLEAN, &(req)->wb_flags);
        radix_tree_tag_set(&nfsi->nfs_page_tree,
                        req->wb_index,
                        NFS_PAGE_TAG_COMMIT);
@@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req)
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
+/*
+ * Clear PG_CLEAN on a request.  If the flag was set, also decrement the
+ * NR_UNSTABLE_NFS zone counter and the BDI_RECLAIMABLE stat for the
+ * request's page.
+ *
+ * Returns 1 if the flag had been set, 0 otherwise.
+ */
+static int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+        struct page *pg = req->wb_page;
+        /* Flag was not set: there is no accounting to undo */
+        if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
+                return 0;
+        dec_zone_page_state(pg, NR_UNSTABLE_NFS);
+        dec_bdi_stat(pg->mapping->backing_dev_info, BDI_RECLAIMABLE);
+        return 1;
+}
 static inline
 int nfs_write_need_commit(struct nfs_write_data *data)
 {
@@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 static inline
 int nfs_reschedule_unstable_write(struct nfs_page *req)
 {
-        if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+        if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
                nfs_mark_request_commit(req);
                return 1;
        }
@@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req)
 {
 }
+static inline int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+        return 0;        /* stub: never clears anything; presumably the variant built without CONFIG_NFS_V3/V4 - confirm */
+}
 static inline
 int nfs_write_need_commit(struct nfs_write_data *data)
 {
@@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
        while(!list_empty(head)) {
                req = nfs_list_entry(head->next);
-                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-                dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-                                BDI_RECLAIMABLE);
                nfs_list_remove_request(req);
-                clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+                nfs_clear_request_commit(req);
                nfs_inode_remove_request(req);
                nfs_unlock_request(req);
        }
@@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
 #endif
 /*
- * Try to update any existing write request, or create one if there is none.
+ * Search for an existing write request, and attempt to update
- * In order to match, the request's credentials must match those of
+ * it to reflect a new dirty region on a given page.
- * the calling process.
 *
- * Note: Should always be called with the Page Lock held!
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
 */
-static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
+static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
-                struct page *page, unsigned int offset, unsigned int bytes)
+                struct page *page,
+                unsigned int offset,
+                unsigned int bytes)
 {
-        struct address_space *mapping = page->mapping;
+        struct nfs_page *req;
-        struct inode *inode = mapping->host;
+        unsigned int rqend;
-        struct nfs_page         *req, *new = NULL;
+        unsigned int end;
-        pgoff_t         rqend, end;
+        int error;
+        if (!PagePrivate(page))
+                return NULL;
        end = offset + bytes;
+        spin_lock(&inode->i_lock);
        for (;;) {
-                /* Loop over all inode entries and see if we find
+                req = nfs_page_find_request_locked(page);
-                 * A request for the page we wish to update
+                if (req == NULL)
+                        goto out_unlock;
+                rqend = req->wb_offset + req->wb_bytes;
+                /*
+                 * Tell the caller to flush out the request if
+                 * the offsets are non-contiguous.
+                 * Note: nfs_flush_incompatible() will already
+                 * have flushed out requests having wrong owners.
                 */
-                if (new) {
+                if (offset > rqend
-                        if (radix_tree_preload(GFP_NOFS)) {
+                    || end < req->wb_offset)
-                                nfs_release_request(new);
+                        goto out_flushme;
-                                return ERR_PTR(-ENOMEM);
-                        }
-                }
-                spin_lock(&inode->i_lock);
+                if (nfs_set_page_tag_locked(req))
-                req = nfs_page_find_request_locked(page);
-                if (req) {
-                        if (!nfs_set_page_tag_locked(req)) {
-                                int error;
-                                spin_unlock(&inode->i_lock);
-                                error = nfs_wait_on_request(req);
-                                nfs_release_request(req);
-                                if (error < 0) {
-                                        if (new) {
-                                                radix_tree_preload_end();
-                                                nfs_release_request(new);
-                                        }
-                                        return ERR_PTR(error);
-                                }
-                                continue;
-                        }
-                        spin_unlock(&inode->i_lock);
-                        if (new) {
-                                radix_tree_preload_end();
-                                nfs_release_request(new);
-                        }
                        break;
-                }
-                if (new) {
+                /* The request is locked, so wait and then retry */
-                        nfs_lock_request_dontget(new);
-                        nfs_inode_add_request(inode, new);
-                        spin_unlock(&inode->i_lock);
-                        radix_tree_preload_end();
-                        req = new;
-                        goto zero_page;
-                }
                spin_unlock(&inode->i_lock);
+                error = nfs_wait_on_request(req);
-                new = nfs_create_request(ctx, inode, page, offset, bytes);
+                nfs_release_request(req);
-                if (IS_ERR(new))
+                if (error != 0)
-                        return new;
+                        goto out_err;
+                spin_lock(&inode->i_lock);
        }
-        /* We have a request for our page.
+        if (nfs_clear_request_commit(req))
-         * If the creds don't match, or the
+                radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
-         * page addresses don't match,
+                                req->wb_index, NFS_PAGE_TAG_COMMIT);
-         * tell the caller to wait on the conflicting
-         * request.
-         */
-        rqend = req->wb_offset + req->wb_bytes;
-        if (req->wb_context != ctx
-            || req->wb_page != page
-            || !nfs_dirty_request(req)
-            || offset > rqend || end < req->wb_offset) {
-                nfs_clear_page_tag_locked(req);
-                return ERR_PTR(-EBUSY);
-        }
        /* Okay, the request matches. Update the region */
        if (offset < req->wb_offset) {
                req->wb_offset = offset;
                req->wb_pgbase = offset;
-                req->wb_bytes = max(end, rqend) - req->wb_offset;
-                goto zero_page;
        }
        if (end > rqend)
                req->wb_bytes = end - req->wb_offset;
+        else
+                req->wb_bytes = rqend - req->wb_offset;
+out_unlock:
+        spin_unlock(&inode->i_lock);
        return req;
-zero_page:
+out_flushme:
-        /* If this page might potentially be marked as up to date,
+        spin_unlock(&inode->i_lock);
-         * then we need to zero any uninitalised data. */
+        nfs_release_request(req);
-        if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE
+        error = nfs_wb_page(inode, page);
-                        && !PageUptodate(req->wb_page))
+out_err:
-                zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE);
+        return ERR_PTR(error);
+}
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Whatever non-NULL value nfs_try_to_update_request() hands back is
+ * returned to the caller unchanged; this includes an ERR_PTR() value.
+ * Only when it returns NULL is a new request created and inserted into
+ * the inode with nfs_inode_add_request().  If that insertion fails, the
+ * new request is released and the error is returned as an ERR_PTR().
+ *
+ * Callers must therefore test the result with IS_ERR() before using it.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+                struct page *page, unsigned int offset, unsigned int bytes)
+{
+        struct inode *inode = page->mapping->host;
+        struct nfs_page *req;
+        int error;
+        req = nfs_try_to_update_request(inode, page, offset, bytes);
+        if (req != NULL)
+                goto out;
+        req = nfs_create_request(ctx, inode, page, offset, bytes);
+        if (IS_ERR(req))
+                goto out;
+        error = nfs_inode_add_request(inode, req);
+        if (error != 0) {
+                nfs_release_request(req);
+                req = ERR_PTR(error);
+        }
+out:
        return req;
 }
+/*
+ * Obtain a write request for "count" bytes at "offset" within the page,
+ * then update the file length, call nfs_mark_uptodate() for the region
+ * the request covers, and clear the request's locked tag.
+ *
+ * Returns 0 on success, or the error carried by the request pointer.
+ */
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+                unsigned int offset, unsigned int count)
+{
+        struct nfs_page *wreq = nfs_setup_write_request(ctx, page, offset, count);
+        if (IS_ERR(wreq))
+                return PTR_ERR(wreq);
+        /* Update file length */
+        nfs_grow_file(page, offset, count);
+        nfs_mark_uptodate(page, wreq->wb_pgbase, wreq->wb_bytes);
+        nfs_clear_page_tag_locked(wreq);
+        return 0;
+}
 int nfs_flush_incompatible(struct file *file, struct page *page)
 {
        struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
                req = nfs_page_find_request(page);
                if (req == NULL)
                        return 0;
-                do_flush = req->wb_page != page || req->wb_context != ctx
+                do_flush = req->wb_page != page || req->wb_context != ctx;
-                        || !nfs_dirty_request(req);
                nfs_release_request(req);
                if (!do_flush)
                        return 0;
@@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page,
        nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
-        dprintk("NFS:      nfs_updatepage(%s/%s %d@%Ld)\n",
+        dprintk("NFS:       nfs_updatepage(%s/%s %d@%lld)\n",
                file->f_path.dentry->d_parent->d_name.name,
                file->f_path.dentry->d_name.name, count,
-                (long long)(page_offset(page) +offset));
+                (long long)(page_offset(page) + offset));
        /* If we're not using byte range locks, and we know the page
         * is up to date, it may be more efficient to extend the write
@@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page,
        else
                __set_page_dirty_nobuffers(page);
-        dprintk("NFS:      nfs_updatepage returns %d (isize %Ld)\n",
+        dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
                        status, (long long)i_size_read(inode));
        return status;
 }
@@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 static void nfs_writepage_release(struct nfs_page *req)
 {
-        if (PageError(req->wb_page)) {
+        if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-                nfs_end_page_writeback(req->wb_page);
-                nfs_inode_remove_request(req);
-        } else if (!nfs_reschedule_unstable_write(req)) {
-                /* Set the PG_uptodate flag */
-                nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
                nfs_end_page_writeback(req->wb_page);
                nfs_inode_remove_request(req);
        } else
@@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
        NFS_PROTO(inode)->write_setup(data, &msg);
        dprintk("NFS: %5u initiated write call "
-                "(req %s/%Ld, %u bytes @ offset %Lu)\n",
+                "(req %s/%lld, %u bytes @ offset %llu)\n",
                data->task.tk_pid,
                inode->i_sb->s_id,
                (long long)NFS_FILEID(inode),
@@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 {
        struct nfs_write_data   *data = calldata;
-        struct nfs_page         *req = data->req;
-        dprintk("NFS: write (%s/%Ld %d@%Ld)",
+        dprintk("NFS: %5u write(%s/%lld %d@%lld)",
-                req->wb_context->path.dentry->d_inode->i_sb->s_id,
+                task->tk_pid,
-                (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
+                data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
-                req->wb_bytes,
+                (long long)
-                (long long)req_offset(req));
+                  NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
+                data->req->wb_bytes, (long long)req_offset(data->req));
        nfs_writeback_done(task, data);
 }
@@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata)
                nfs_list_remove_request(req);
-                dprintk("NFS: write (%s/%Ld %d@%Ld)",
+                dprintk("NFS: %5u write (%s/%lld %d@%lld)",
+                        data->task.tk_pid,
                        req->wb_context->path.dentry->d_inode->i_sb->s_id,
                        (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
                        req->wb_bytes,
@@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata)
                        dprintk(" marked for commit\n");
                        goto next;
                }
-                /* Set the PG_uptodate flag? */
-                nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
                dprintk(" OK\n");
 remove_request:
                nfs_end_page_writeback(page);
@@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
                static unsigned long    complain;
                if (time_before(complain, jiffies)) {
-                        dprintk("NFS: faulty NFS server %s:"
+                        dprintk("NFS:       faulty NFS server %s:"
                                " (committed = %d) != (stable = %d)\n",
                                NFS_SERVER(data->inode)->nfs_client->cl_hostname,
                                resp->verf->committed, argp->stable);
@@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata)
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
-                clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+                nfs_clear_request_commit(req);
-                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-                dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-                                BDI_RECLAIMABLE);
-                dprintk("NFS: commit (%s/%Ld %d@%Ld)",
+                dprintk("NFS:       commit (%s/%lld %d@%lld)",
                        req->wb_context->path.dentry->d_inode->i_sb->s_id,
                        (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
                        req->wb_bytes,
@@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata)
                 * returned by the server against all stored verfs. */
                if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
                        /* We have a match */
-                        /* Set the PG_uptodate flag */
-                        nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
-                                        req->wb_bytes);
                        nfs_inode_remove_request(req);
                        dprintk(" OK\n");
                        goto next;
@@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
                req = nfs_page_find_request(page);
                if (req == NULL)
                        goto out;
-                if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+                if (test_bit(PG_CLEAN, &req->wb_flags)) {
                        nfs_release_request(req);
                        break;
                }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4d4760e687c3..702fa577aa6e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -381,7 +381,7 @@ static int do_probe_callback(void *data)
                .program        = &cb_program,
                .version        = nfs_cb_version[1]->number,
                .authflavor     = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
-                .flags          = (RPC_CLNT_CREATE_NOPING),
+                .flags          = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
        };
        struct rpc_message msg = {
                .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd462..bd7e0f3acfc7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
                return -ENOMEM;
        p->op_this_node = -1;
+        lock_kernel();
        mutex_lock(&ocfs2_control_lock);
        file->private_data = p;
        list_add(&p->op_list, &ocfs2_control_private_list);
        mutex_unlock(&ocfs2_control_lock);
+        unlock_kernel();
        return 0;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7ff..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
         */
        if (task->parent == current && (task->ptrace & PT_PTRACED) &&
            task_is_stopped_or_traced(task) &&
-            ptrace_may_attach(task))
+            ptrace_may_access(task, PTRACE_MODE_ATTACH))
                return 0;
        /*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
        task_lock(task);
        if (task->mm != mm)
                goto out;
-        if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+        if (task->mm != current->mm &&
+            __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
                goto out;
        task_unlock(task);
        return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
         */
        task = get_proc_task(inode);
        if (task) {
-                allowed = ptrace_may_attach(task);
+                allowed = ptrace_may_access(task, PTRACE_MODE_READ);
                put_task_struct(task);
        }
        return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
        if (!task)
                goto out_no_task;
-        if (!ptrace_may_attach(task))
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out;
        ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad466..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
        return proc_calc_metrics(page, start, off, count, eof, len);
 }
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{       /* Weak default; a non-weak definition elsewhere replaces it at link time. */
+        return 0;       /* page is left untouched; `len += ...` in meminfo_read_proc adds 0 */
+}
 static int meminfo_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
 {
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                len += hugetlb_report_meminfo(page + len);
+        len += arch_report_meminfo(page + len);
        return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
 };
 #endif
+#ifndef arch_irq_stat_cpu       /* keep any definition already in effect */
+#define arch_irq_stat_cpu(cpu) 0        /* fallback: the in-loop `sum +=` in show_stat adds 0 */
+#endif /* arch_irq_stat_cpu */
+#ifndef arch_irq_stat           /* keep any definition already in effect */
+#define arch_irq_stat() 0       /* fallback: the post-loop `sum +=` in show_stat adds 0 */
+#endif /* arch_irq_stat */
 static int show_stat(struct seq_file *p, void *v)
 {
        int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
                        sum += temp;
                        per_irq_sum[j] += temp;
                }
+                sum += arch_irq_stat_cpu(i);
        }
+        sum += arch_irq_stat();
        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
                (unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c492449f3b45..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
        dev_t dev = 0;
        int len;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        if (file) {
@@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                goto out;
        ret = -EACCES;
-        if (!ptrace_may_attach(task))
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out_task;
        ret = -EINVAL;
@@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
        .mmap           = generic_file_mmap,
        .fsync          = simple_sync_file,
        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
        .llseek         = generic_file_llseek,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
        .aio_write              = generic_file_aio_write,
        .fsync                  = simple_sync_file,
        .splice_read            = generic_file_splice_read,
+        .splice_write           = generic_file_splice_write,
        .llseek                 = generic_file_llseek,
 };
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 {
        loff_t retval;
        struct inode *inode = file->f_mapping->host;
-        mutex_lock(&inode->i_mutex);
        switch (origin) {
                case SEEK_END:
                        offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
        }
        retval = -EINVAL;
        if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+                /* Special lock needed here? */
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                retval = offset;
        }
-        mutex_unlock(&inode->i_mutex);
        return retval;
 }
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 {
-        loff_t retval;
+        loff_t n;       /* Locked front end: runs the _unlocked seek with i_mutex held. */
+        mutex_lock(&file->f_dentry->d_inode->i_mutex);  /* NOTE(review): old code locked f_mapping->host->i_mutex -- confirm same inode */
-        lock_kernel();
+        n = generic_file_llseek_unlocked(file, offset, origin); /* performs the seek; locking is left to this caller */
-        switch (origin) {
+        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
-                case SEEK_END:
+        return n;       /* new offset, or -EINVAL when the helper rejects the target */
-                        offset += i_size_read(file->f_path.dentry->d_inode);
-                        break;
-                case SEEK_CUR:
-                        offset += file->f_pos;
-        }
-        retval = -EINVAL;
-        if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
-                if (offset != file->f_pos) {
-                        file->f_pos = offset;
-                        file->f_version = 0;
-                }
-                retval = offset;
-        }
-        unlock_kernel();
-        return retval;
 }
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
        return error;
 }
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{       /* smbfs .llseek handler: the generic seek, bracketed by lock_kernel()/unlock_kernel(). */
+        loff_t ret;
+        lock_kernel();  /* same lock the removed remote_llseek() took */
+        ret = generic_file_llseek_unlocked(file, offset, origin);       /* locking is left to this caller */
+        unlock_kernel();
+        return ret;     /* new offset, or -EINVAL when the helper rejects the target */
+}
 const struct file_operations smb_file_operations =
 {
-        .llseek         = remote_llseek,
+        .llseek         = smb_remote_llseek,
        .read           = do_sync_read,
        .aio_read       = smb_file_aio_read,
        .write          = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b305..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                lock_page(page);
                        /*
-                         * page was truncated, stop here. if this isn't the
+                         * Page was truncated, or invalidated by the
-                         * first page, we'll just complete what we already
+                         * filesystem.  Redo the find/create, but this time the
-                         * added
+                         * page is kept locked, so there's no chance of another
+                         * race with truncate/invalidate.
                         */
                        if (!page->mapping) {
                                unlock_page(page);
-                                break;
+                                page = find_or_create_page(mapping, index,
+                                                mapping_gfp_mask(mapping));
+                                if (!page) {
+                                        error = -ENOMEM;
+                                        break;
+                                }
+                                page_cache_release(pages[page_nr]);
+                                pages[page_nr] = page;
                        }
                        /*
                         * page was already under io and is now done, great
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
        if (len == 0)
                return -ENOENT;
-        slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+        slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
        if (slots == NULL)
                return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
        struct dentry *alias;
        int err, table;
-        lock_kernel();
+        lock_super(sb);
        table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
        dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
-                unlock_kernel();
+                unlock_super(sb);
                return ERR_CAST(inode);
        }
        alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
                        dput(alias);
                else {
                        iput(inode);
-                        unlock_kernel();
+                        unlock_super(sb);
                        return alias;
                }
        }
 error:
-        unlock_kernel();
+        unlock_super(sb);
        dentry->d_op = &vfat_dentry_ops[table];
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
        struct timespec ts;
        int err;
-        lock_kernel();
+        lock_super(sb);
        ts = CURRENT_TIME_SEC;
        err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        d_instantiate(dentry, inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
 static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = fat_dir_empty(inode);
        if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
        inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -791,10 +792,11 @@ out:
 static int vfat_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
        inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct timespec ts;
        int err, cluster;
-        lock_kernel();
+        lock_super(sb);
        ts = CURRENT_TIME_SEC;
        cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        d_instantiate(dentry, inode);
-        unlock_kernel();
+        unlock_super(sb);
        return 0;
 out_free:
        fat_free_clusters(dir, cluster);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct timespec ts;
        loff_t dotdot_i_pos, new_i_pos;
        int err, is_dir, update_dotdot, corrupt = 0;
+        struct super_block *sb = old_dir->i_sb;
        old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
        old_inode = old_dentry->d_inode;
        new_inode = new_dentry->d_inode;
-        lock_kernel();
+        lock_super(sb);
        err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
        if (err)
                goto out;
@@ -951,7 +954,7 @@ out:
        brelse(sinfo.bh);
        brelse(dotdot_bh);
        brelse(old_sinfo.bh);
-        unlock_kernel();
+        unlock_super(sb);
        return err;