102 files changed, 4902 insertions, 3225 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 0ce72dcd6b96..84ab76a206a0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -930,7 +930,7 @@ config PROC_KCORE
 config PROC_VMCORE
        bool "/proc/vmcore support (EXPERIMENTAL)"
-        depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+        depends on PROC_FS && CRASH_DUMP
        default y
        help
        Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da1..277b079dec9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
 obj-y +=        no-block.o
 endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-$(CONFIG_INOTIFY)           += inotify.o
 obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
 obj-$(CONFIG_EPOLL)             += eventpoll.o
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+/* Slab cache that bioset_integrity_create() builds its payload mempool on */
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+/* Workqueue that bio_integrity_endio() queues read verification work on */
+static struct workqueue_struct *kintegrityd_wq;
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio:        bio that receives the new integrity payload
+ * @gfp_mask:   allocation mask used for the payload and for its vector
+ * @nr_vecs:    number of integrity bio_vec entries to request
+ * @bs:         bio_set whose pools the allocations come from
+ *
+ * Description: Takes a payload from the bio_set's integrity pool,
+ * zeroes it, obtains a bio_vec array for it through bvec_alloc_bs()
+ * and links payload and bio to each other.  Returns the payload, or
+ * NULL after logging an error if either allocation fails.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+                                                         gfp_t gfp_mask,
+                                                         unsigned int nr_vecs,
+                                                         struct bio_set *bs)
+{
+        struct bio_integrity_payload *payload;
+        struct bio_vec *vecs;
+        unsigned long pool_idx;
+        BUG_ON(bio == NULL);
+        payload = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+        if (unlikely(!payload)) {
+                printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+                return NULL;
+        }
+        memset(payload, 0, sizeof(*payload));
+        vecs = bvec_alloc_bs(gfp_mask, nr_vecs, &pool_idx, bs);
+        if (unlikely(!vecs)) {
+                printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+                /* Hand the payload back; nothing has been linked yet */
+                mempool_free(payload, bs->bio_integrity_pool);
+                return NULL;
+        }
+        /* Remember which bvec pool the vector came from */
+        payload->bip_pool = pool_idx;
+        payload->bip_vec = vecs;
+        payload->bip_bio = bio;
+        bio->bi_integrity = payload;
+        return payload;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio:        bio that receives the new integrity payload
+ * @gfp_mask:   allocation mask
+ * @nr_vecs:    number of integrity bio_vec entries to request
+ *
+ * Description: Convenience wrapper around bio_integrity_alloc_bioset()
+ * that allocates from fs_bio_set.  Returns whatever that function
+ * returns.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+                                                  gfp_t gfp_mask,
+                                                  unsigned int nr_vecs)
+{
+        struct bio_set *bs = fs_bio_set;
+        return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, bs);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio:        bio containing bip to be freed
+ * @bs:         bio_set this bio was allocated from
+ *
+ * Description: Releases the integrity vector and the payload back to
+ * the pools of @bs and clears bio->bi_integrity.  The protection
+ * buffer (bip_buf) is only freed when the bio is not flagged
+ * BIO_CLONED.  The bio must have a payload attached.
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        BUG_ON(bip == NULL);
+        /*
+         * A cloned bio doesn't own the integrity metadata.  kfree()
+         * accepts NULL, so bip_buf needs no separate check.
+         */
+        if (!bio_flagged(bio, BIO_CLONED))
+                kfree(bip->bip_buf);
+        /* bip_pool selects the bvec pool the vector was taken from */
+        mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+        mempool_free(bip, bs->bio_integrity_pool);
+        bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio:        bio whose integrity payload is extended
+ * @page:       page containing integrity metadata
+ * @len:        number of bytes of integrity metadata in page
+ * @offset:     start offset within page
+ *
+ * Description: Stores @page, @len and @offset in the next unused
+ * entry of the bio's integrity vector.  Returns @len on success, or 0
+ * (after logging) when the vector has no free entry left.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+                           unsigned int len, unsigned int offset)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct bio_vec *slot;
+        if (bip->bip_vcnt < bvec_nr_vecs(bip->bip_pool)) {
+                slot = bip_vec_idx(bip, bip->bip_vcnt);
+                BUG_ON(slot == NULL);
+                /* The entry about to be filled must still be empty */
+                BUG_ON(slot->bv_page != NULL);
+                slot->bv_offset = offset;
+                slot->bv_len = len;
+                slot->bv_page = page;
+                bip->bip_vcnt++;
+                return len;
+        }
+        printk(KERN_ERR "%s: bip_vec full\n", __func__);
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio:        bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not.  bio data direction and target device must be
+ * set prior to calling.  Returns 0 when the bio already carries an
+ * integrity payload; otherwise returns the result of
+ * bdev_integrity_enabled() for the bio's device and data direction.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+        if (!bio_integrity(bio))
+                return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+        /* An integrity payload is already attached */
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi:         blk_integrity profile for device
+ * @sectors:    Number of 512 sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device.  Returns @sectors divided by 8
+ * when the profile's sector size is 4096, and @sectors unchanged for
+ * any other sector size.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+                                                    unsigned int sectors)
+{
+        /* At this point there are only 512b or 4096b DIF/EPP devices */
+        if (bi->sector_size == 4096)
+                return sectors >> 3;    /* eight 512b sectors per 4096b */
+        return sectors;
+}
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio:        bio to inspect
+ *
+ * Description: Returns the number of tag bytes available for this
+ * bio: the profile's tag_size multiplied by the number of
+ * sector_size units in bio->bi_size.  The bio must not be empty.
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int nr_hw_sectors;
+        BUG_ON(bio->bi_size == 0);
+        nr_hw_sectors = bio->bi_size / bi->sector_size;
+        return bi->tag_size * nr_hw_sectors;
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+/*
+ * bio_integrity_tag - common helper behind the set/get tag functions
+ *
+ * Works out how many sectors @len bytes of tag data span, checks that
+ * the matching number of tuples fits in the bio's protection buffer,
+ * and then calls the profile's set_tag_fn (@set non-zero) or
+ * get_tag_fn (@set zero).  Returns 0 on success and -1 when the
+ * profile has no tag space or the tag does not fit.
+ */
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int nr_sectors, needed;
+        BUG_ON(bip->bip_buf == NULL);
+        if (bi->tag_size == 0)
+                return -1;
+        nr_sectors = bio_integrity_hw_sectors(bi,
+                                        DIV_ROUND_UP(len, bi->tag_size));
+        /* Bytes of protection buffer the requested tag would touch */
+        needed = nr_sectors * bi->tuple_size;
+        if (needed > bip->bip_size) {
+                printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+                       __func__, needed, bip->bip_size);
+                return -1;
+        }
+        if (!set)
+                bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+        else
+                bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+        return 0;
+}
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio:        bio to attach buffer to
+ * @tag_buf:    Pointer to a buffer containing tag data
+ * @len:        Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection.  The
+ * size of the integrity buffer must be <= to the size reported by
+ * bio_integrity_tag_size().  The bio must be a WRITE.  Returns the
+ * result of bio_integrity_tag().
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+        const int set = 1;
+        BUG_ON(bio_data_dir(bio) != WRITE);
+        return bio_integrity_tag(bio, tag_buf, len, set);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio:        bio to retrieve buffer from
+ * @tag_buf:    Pointer to a buffer for the tag data
+ * @len:        Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= to the
+ * size reported by bio_integrity_tag_size().  The bio must be a READ.
+ * Returns the result of bio_integrity_tag().
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+        const int set = 0;
+        BUG_ON(bio_data_dir(bio) != READ);
+        return bio_integrity_tag(bio, tag_buf, len, set);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio:        bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function once per data segment.
+ * The bio must have a bip attached with enough room to accommodate
+ * the generated integrity metadata; exceeding bip_size triggers
+ * BUG_ON().
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        struct blk_integrity_exchg bix;
+        struct bio_vec *bv;
+        sector_t sector = bio->bi_sector;
+        unsigned int i, sectors, total;
+        /* Cursor into the protection buffer; advanced after every segment */
+        void *prot_buf = bio->bi_integrity->bip_buf;
+        total = 0;
+        /* These exchange fields are the same for every segment */
+        bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+        bix.sector_size = bi->sector_size;
+        bio_for_each_segment(bv, bio, i) {
+                /* Map the data page while the callback looks at it */
+                void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+                bix.data_buf = kaddr + bv->bv_offset;
+                bix.data_size = bv->bv_len;
+                bix.prot_buf = prot_buf;
+                bix.sector = sector;
+                bi->generate_fn(&bix);
+                /* Step past the sectors this segment covered */
+                sectors = bv->bv_len / bi->sector_size;
+                sector += sectors;
+                prot_buf += sectors * bi->tuple_size;
+                /* total counts protection bytes accounted for so far */
+                total += sectors * bi->tuple_size;
+                BUG_ON(total > bio->bi_integrity->bip_size);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+}
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio:        bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio.  The bio must have data
+ * direction, target device and start sector set prior to calling.  In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function.  In the READ case, the original
+ * end_io handler is saved in the bip and bio_integrity_endio is
+ * installed in its place.
+ *
+ * Returns 0 on success and -EIO if the protection buffer or the
+ * integrity payload cannot be allocated.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+        struct bio_integrity_payload *bip;
+        struct blk_integrity *bi;
+        struct request_queue *q;
+        void *buf;
+        unsigned long start, end;
+        unsigned int len, nr_pages;
+        unsigned int bytes, offset, i;
+        unsigned int sectors;
+        bi = bdev_get_integrity(bio->bi_bdev);
+        q = bdev_get_queue(bio->bi_bdev);
+        BUG_ON(bi == NULL);
+        /* The bio must not already carry an integrity payload */
+        BUG_ON(bio_integrity(bio));
+        sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+        /* Allocate kernel buffer for protection data */
+        len = sectors * blk_integrity_tuple_size(bi);
+        buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+        if (unlikely(buf == NULL)) {
+                printk(KERN_ERR "could not allocate integrity buffer\n");
+                return -EIO;
+        }
+        /* Count the pages that [buf, buf + len) touches */
+        end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        start = ((unsigned long) buf) >> PAGE_SHIFT;
+        nr_pages = end - start;
+        /* Allocate bio integrity payload and integrity vectors */
+        bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+        if (unlikely(bip == NULL)) {
+                printk(KERN_ERR "could not allocate data integrity bioset\n");
+                kfree(buf);
+                return -EIO;
+        }
+        /* From here on the buffer is reachable through the payload */
+        bip->bip_buf = buf;
+        bip->bip_size = len;
+        bip->bip_sector = bio->bi_sector;
+        /* Map it */
+        offset = offset_in_page(buf);
+        for (i = 0 ; i < nr_pages ; i++) {
+                int ret;
+                /* Only the first page can start at a non-zero offset */
+                bytes = PAGE_SIZE - offset;
+                if (len <= 0)
+                        break;
+                if (bytes > len)
+                        bytes = len;
+                ret = bio_integrity_add_page(bio, virt_to_page(buf),
+                                             bytes, offset);
+                /*
+                 * NOTE(review): a full integrity vector makes this
+                 * function return 0 without installing the read
+                 * completion handler or generating write metadata --
+                 * confirm that this is intended.
+                 */
+                if (ret == 0)
+                        return 0;
+                if (ret < bytes)
+                        break;
+                buf += bytes;
+                len -= bytes;
+                offset = 0;
+        }
+        /* Reads: save the caller's handler and interpose bio_integrity_endio */
+        if (bio_data_dir(bio) == READ) {
+                bip->bip_end_io = bio->bi_end_io;
+                bio->bi_end_io = bio_integrity_endio;
+        }
+        /* Auto-generate integrity metadata if this is a write */
+        if (bio_data_dir(bio) == WRITE)
+                bio_integrity_generate(bio);
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio:        bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio.  Each data segment in the bio io_vec is passed, together with
+ * the matching position in the protection buffer, to the block
+ * device's verify callback.  Returns 0 if every segment verified, or
+ * the first non-zero callback result, at which point checking stops.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        struct blk_integrity_exchg bix;
+        struct bio_vec *bv;
+        /* Start from the sector recorded in the payload, not bio->bi_sector */
+        sector_t sector = bio->bi_integrity->bip_sector;
+        unsigned int i, sectors, total, ret;
+        /* Cursor into the protection buffer; advanced after every segment */
+        void *prot_buf = bio->bi_integrity->bip_buf;
+        ret = total = 0;
+        /* These exchange fields are the same for every segment */
+        bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+        bix.sector_size = bi->sector_size;
+        bio_for_each_segment(bv, bio, i) {
+                /* Map the data page while the callback looks at it */
+                void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+                bix.data_buf = kaddr + bv->bv_offset;
+                bix.data_size = bv->bv_len;
+                bix.prot_buf = prot_buf;
+                bix.sector = sector;
+                ret = bi->verify_fn(&bix);
+                if (ret) {
+                        /* Drop the mapping before bailing out */
+                        kunmap_atomic(kaddr, KM_USER0);
+                        break;
+                }
+                /* Step past the sectors this segment covered */
+                sectors = bv->bv_len / bi->sector_size;
+                sector += sectors;
+                prot_buf += sectors * bi->tuple_size;
+                /* total counts protection bytes accounted for so far */
+                total += sectors * bi->tuple_size;
+                BUG_ON(total > bio->bi_integrity->bip_size);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+        return ret;
+}
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work:       Work struct embedded in the payload of the bio to verify
+ *
+ * Description: Work item queued by bio_integrity_endio().  Verifies
+ * the bio's integrity metadata; on failure the bio loses its
+ * BIO_UPTODATE flag and the error becomes -EIO.  Afterwards the saved
+ * end_io handler is put back on the bio and, if there is one, called
+ * with the resulting error.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+        struct bio_integrity_payload *payload =
+                container_of(work, struct bio_integrity_payload, bip_work);
+        struct bio *bio = payload->bip_bio;
+        int status = payload->bip_error;
+        if (bio_integrity_verify(bio) != 0) {
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+                status = -EIO;
+        }
+        /* Restore original bio completion handler */
+        bio->bi_end_io = payload->bip_end_io;
+        if (!bio->bi_end_io)
+                return;
+        bio->bi_end_io(bio, status);
+}
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio:        Protected bio
+ * @error:      Completion status, passed on to the original handler
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context.  However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context.  This function records @error in the payload
+ * and queues bio_integrity_verify_fn() on the kintegrityd workqueue
+ * to finish the job.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+        struct bio_integrity_payload *payload = bio->bi_integrity;
+        /* The payload must point back at the bio being completed */
+        BUG_ON(payload->bip_bio != bio);
+        payload->bip_error = error;
+        INIT_WORK(&payload->bip_work, bio_integrity_verify_fn);
+        queue_work(kintegrityd_wq, &payload->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip:        Integrity vector to advance
+ * @skip:       Number of bytes to advance it
+ *
+ * Description: Walks the integrity vector, consuming whole entries
+ * while @skip covers them.  The entry where the walk stops becomes
+ * bip_idx; if @skip ends inside that entry its offset and length are
+ * adjusted by the remainder.
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+                             unsigned int skip)
+{
+        struct bio_vec *vec;
+        unsigned int n;
+        bip_for_each_vec(vec, bip, n) {
+                if (skip != 0 && skip >= vec->bv_len) {
+                        /* Entry is skipped entirely */
+                        skip -= vec->bv_len;
+                } else {
+                        if (skip != 0) {
+                                /* Partial skip: trim the front of this entry */
+                                vec->bv_offset += skip;
+                                vec->bv_len -= skip;
+                        }
+                        bip->bip_idx = n;
+                        return;
+                }
+        }
+}
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip:        Integrity vector to truncate
+ * @len:        New length of integrity vector
+ *
+ * Description: Walks the integrity vector until @len bytes have been
+ * accounted for, shortening the entry in which @len runs out, and
+ * sets bip_vcnt to the index of the first entry past that point.
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+                             unsigned int len)
+{
+        struct bio_vec *vec;
+        unsigned int n;
+        bip_for_each_vec(vec, bip, n) {
+                if (len == 0) {
+                        /* Everything from this entry on is cut off */
+                        bip->bip_vcnt = n;
+                        return;
+                }
+                if (len < vec->bv_len) {
+                        /* Final entry: keep only the remaining bytes */
+                        vec->bv_len = len;
+                        len = 0;
+                } else {
+                        len -= vec->bv_len;
+                }
+        }
+}
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio:        bio whose integrity vector to update
+ * @bytes_done: number of data bytes that have been completed
+ *
+ * Description: Converts @bytes_done to 512 byte sectors, then to
+ * hardware sectors, multiplies by the profile's tuple size and
+ * advances the head of the integrity vector by that many bytes.
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int skip_bytes;
+        BUG_ON(bip == NULL);
+        BUG_ON(bi == NULL);
+        /* bytes_done >> 9 is the number of 512 byte sectors completed */
+        skip_bytes = bio_integrity_hw_sectors(bi, bytes_done >> 9) *
+                     bi->tuple_size;
+        bio_integrity_mark_head(bip, skip_bytes);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio:        bio whose integrity vector to update
+ * @offset:     offset to first data sector
+ * @sectors:    number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'sectors' data
+ * sectors.  The bio must be flagged BIO_CLONED and must have both a
+ * payload and an integrity profile.
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+                        unsigned int sectors)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        BUG_ON(bip == NULL);
+        BUG_ON(bi == NULL);
+        BUG_ON(!bio_flagged(bio, BIO_CLONED));
+        /*
+         * NOTE(review): offset and sectors are multiplied by the tuple
+         * size as given, without going through
+         * bio_integrity_hw_sectors() -- confirm that this is right
+         * for devices with 4096 byte sectors.
+         */
+        bip->bip_sector = bip->bip_sector + offset;
+        bio_integrity_mark_head(bip, offset * bi->tuple_size);
+        bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio:        Protected bio
+ * @bp:         Resulting bio_pair
+ * @sectors:    Offset
+ *
+ * Description: Splits an integrity page into a bio_pair.  Does
+ * nothing if @bio carries no integrity payload.  The payload must
+ * consist of exactly one vector entry; both halves of the pair get a
+ * copy of that entry, the first cut off at the split point and the
+ * second starting there.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+        struct blk_integrity *bi;
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        unsigned int nr_sectors;
+        if (bio_integrity(bio) == 0)
+                return;
+        bi = bdev_get_integrity(bio->bi_bdev);
+        BUG_ON(bi == NULL);
+        /* Only a single-entry integrity vector can be split here */
+        BUG_ON(bip->bip_vcnt != 1);
+        nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+        /* Point both halves at the payloads embedded in the bio_pair */
+        bp->bio1.bi_integrity = &bp->bip1;
+        bp->bio2.bi_integrity = &bp->bip2;
+        /* Both halves start out as copies of the one original entry */
+        bp->iv1 = bip->bip_vec[0];
+        bp->iv2 = bip->bip_vec[0];
+        bp->bip1.bip_vec = &bp->iv1;
+        bp->bip2.bip_vec = &bp->iv2;
+        /*
+         * NOTE(review): the byte split below uses the raw 'sectors'
+         * value while bip_sector further down uses the converted
+         * nr_sectors -- confirm for devices with 4096 byte sectors.
+         */
+        bp->iv1.bv_len = sectors * bi->tuple_size;
+        bp->iv2.bv_offset += sectors * bi->tuple_size;
+        bp->iv2.bv_len -= sectors * bi->tuple_size;
+        bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+        bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+        bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+        bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio:        New bio
+ * @bio_src:    Original bio
+ * @bs:         bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio.  The new
+ * payload receives a copy of the source's vector entries along with
+ * its sector, entry count and current index.  Returns 0 on success
+ * and -EIO if the payload cannot be allocated.
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+                        struct bio_set *bs)
+{
+        struct bio_integrity_payload *src = bio_src->bi_integrity;
+        struct bio_integrity_payload *dst;
+        BUG_ON(src == NULL);
+        dst = bio_integrity_alloc_bioset(bio, GFP_NOIO, src->bip_vcnt, bs);
+        if (!dst)
+                return -EIO;
+        /* Copy the vector entries themselves, then the bookkeeping */
+        memcpy(dst->bip_vec, src->bip_vec,
+               src->bip_vcnt * sizeof(struct bio_vec));
+        dst->bip_idx = src->bip_idx;
+        dst->bip_vcnt = src->bip_vcnt;
+        dst->bip_sector = src->bip_sector;
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+/*
+ * bioset_integrity_create - set up the integrity payload pool of a bio_set
+ *
+ * Creates a mempool of pool_size elements on top of bio_integrity_slab
+ * and stores it in bs->bio_integrity_pool.  Returns 0 on success and
+ * -1 if the pool could not be created.
+ */
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+        bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
+                                                          bio_integrity_slab);
+        return bs->bio_integrity_pool ? 0 : -1;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+/*
+ * bioset_integrity_free - destroy the integrity payload pool of a bio_set
+ *
+ * Does nothing when the bio_set has no integrity pool.
+ */
+void bioset_integrity_free(struct bio_set *bs)
+{
+        if (!bs->bio_integrity_pool)
+                return;
+        mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+/*
+ * bio_integrity_init_slab - create the slab cache for integrity payloads
+ *
+ * SLAB_PANIC is passed, so no failure handling follows the call.
+ *
+ * NOTE(review): the function is marked __init and is also exported
+ * with EXPORT_SYMBOL -- confirm that the export is needed.
+ */
+void __init bio_integrity_init_slab(void)
+{
+        bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+                                        SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+EXPORT_SYMBOL(bio_integrity_init_slab);
+/*
+ * integrity_init - create the kintegrityd workqueue at subsys_initcall time
+ *
+ * Panics if the workqueue cannot be created; otherwise returns 0.
+ */
+static int __init integrity_init(void)
+{
+        struct workqueue_struct *wq = create_workqueue("kintegrityd");
+        kintegrityd_wq = wq;
+        if (wq == NULL)
+                panic("Failed to create kintegrityd\n");
+        return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>            /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
 static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
-        int nr_vecs;
-        char *name; 
-        struct kmem_cache *slab;
-};
 /*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 #undef BV
 /*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
-        mempool_t *bio_pool;
-        mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+/* Return the nr_vecs value recorded for entry idx of bvec_slabs[] */
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+        const struct biovec_slab *slab = &bvec_slabs[idx];
+        return slab->nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
        struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
                mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
        }
+        if (bio_integrity(bio))
+                bio_integrity_free(bio, bio_set);
        mempool_free(bio, bio_set->bio_pool);
 }
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
        struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
-        if (b) {
+        if (!b)
-                b->bi_destructor = bio_fs_destructor;
+                return NULL;
-                __bio_clone(b, bio);
+        b->bi_destructor = bio_fs_destructor;
+        __bio_clone(b, bio);
+        if (bio_integrity(bio)) {
+                int ret;
+                ret = bio_integrity_clone(b, bio, fs_bio_set);
+                if (ret < 0)
+                        return NULL;
        }
        return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                if (page == prev->bv_page &&
                    offset == prev->bv_offset + prev->bv_len) {
                        prev->bv_len += len;
-                        if (q->merge_bvec_fn &&
-                            q->merge_bvec_fn(q, bio, prev) < len) {
+                        if (q->merge_bvec_fn) {
-                                prev->bv_len -= len;
+                                struct bvec_merge_data bvm = {
-                                return 0;
+                                        .bi_bdev = bio->bi_bdev,
+                                        .bi_sector = bio->bi_sector,
+                                        .bi_size = bio->bi_size,
+                                        .bi_rw = bio->bi_rw,
+                                };
+                                if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+                                        prev->bv_len -= len;
+                                        return 0;
+                                }
                        }
                        goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
         * queue to get further control
         */
        if (q->merge_bvec_fn) {
+                struct bvec_merge_data bvm = {
+                        .bi_bdev = bio->bi_bdev,
+                        .bi_sector = bio->bi_sector,
+                        .bi_size = bio->bi_size,
+                        .bi_rw = bio->bi_rw,
+                };
                /*
                 * merge_bvec_fn() returns number of bytes it can accept
                 * at this offset
                 */
-                if (q->merge_bvec_fn(q, bio, bvec) < len) {
+                if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
                        bvec->bv_page = NULL;
                        bvec->bv_len = 0;
                        bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
        bp->bio1.bi_private = bi;
        bp->bio2.bi_private = pool;
+        if (bio_integrity(bi))
+                bio_integrity_split(bi, bp, first_sectors);
        return bp;
 }
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
+        bioset_integrity_free(bs);
        biovec_free_pools(bs);
        kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
        if (!bs->bio_pool)
                goto bad;
+        if (bioset_integrity_create(bs, bio_pool_size))
+                goto bad;
        if (!biovec_create_pools(bs, bvec_pool_size))
                return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
 {
        bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+        bio_integrity_init_slab();
        biovec_init_slabs();
        fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..d48caee12e2a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg)
        
 void invalidate_bh_lrus(void)
 {
-        on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
+        on_each_cpu(invalidate_bh_lru, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
-                } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
+                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
-                if (buffer_mapped(bh) && buffer_dirty(bh)) {
+                if (buffer_mapped(bh) && buffer_dirty(bh) &&
+                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write(bh);
                } else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
                        struct page *page, void *fsdata)
 {
        struct inode *inode = mapping->host;
+        int i_size_changed = 0;
        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
         */
        if (pos+copied > inode->i_size) {
                i_size_write(inode, pos+copied);
-                mark_inode_dirty(inode);
+                i_size_changed = 1;
        }
        unlock_page(page);
        page_cache_release(page);
+        /*
+         * Don't mark the inode dirty under page lock. First, it unnecessarily
+         * makes the holding time of page lock longer. Second, it forces lock
+         * ordering of page lock and transaction start for journaling
+         * filesystems.
+         */
+        if (i_size_changed)
+                mark_inode_dirty(inode);
        return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                        return -ENXIO;
                new = container_of(kobj, struct cdev, kobj);
                spin_lock(&cdev_lock);
+                /* Check i_cdev again in case somebody beat us to it while
+                   we dropped the lock. */
                p = inode->i_cdev;
                if (!p) {
                        inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                cdev_put(p);
                return -ENXIO;
        }
-        if (filp->f_op->open) {
+        if (filp->f_op->open)
-                lock_kernel();
                ret = filp->f_op->open(inode,filp);
-                unlock_kernel();
-        }
        if (ret)
                cdev_put(p);
        return ret;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff5400..0e9fc2ba90ee 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
 static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
        {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
        {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
-        {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
+        {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
-        {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
+        {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
-        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
+        {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
-        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
+        {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
-        {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
+        {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
 ;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405ae..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
                if (retval < 0)
                        return (loff_t)retval;
        }
-        return remote_llseek(file, offset, origin);
+        return generic_file_llseek_unlocked(file, offset, origin);
 }
 struct file_system_type cifs_fs_type = {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 722be543ceec..2e904bd111c8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
                                  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-        if (rc) {
+        if (rc == -EREMOTE && !is_dfs_referral) {
-                if (rc == -EREMOTE && !is_dfs_referral) {
+                is_dfs_referral = true;
-                        is_dfs_referral = true;
+                cFYI(DBG2, ("DFS ref"));
-                        cFYI(DBG2, ("DFS ref"));
+                /* for DFS, server does not give us real inode data */
-                        /* for DFS, server does not give us real inode data */
+                fill_fake_finddataunix(&find_data, sb);
-                        fill_fake_finddataunix(&find_data, sb);
+                rc = 0;
-                        rc = 0;
+        } else if (rc)
-                }
+                goto cgiiu_exit;
-        }
        num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
        end_of_file = le64_to_cpu(find_data.EndOfFile);
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                *pinode = new_inode(sb);
                if (*pinode == NULL) {
                        rc = -ENOMEM;
-                goto cgiiu_exit;
+                        goto cgiiu_exit;
                }
                /* Is an i_ino of zero legal? */
                /* note ino incremented to unique num in new_inode */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
+#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
        struct dlm_user_proc *proc;
        struct dlm_ls *ls;
+        lock_kernel();
        ls = dlm_find_lockspace_device(iminor(inode));
-        if (!ls)
+        if (!ls) {
+                unlock_kernel();
                return -ENOENT;
+        }
        proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
        if (!proc) {
                dlm_put_lockspace(ls);
+                unlock_kernel();
                return -ENOMEM;
        }
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
        spin_lock_init(&proc->locks_spin);
        init_waitqueue_head(&proc->wait);
        file->private_data = proc;
+        unlock_kernel();
        return 0;
 }
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
+        cycle_kernel_lock();
        file->private_data = NULL;
        return 0;
 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
        int rc = 0;
        struct file *lower_file = NULL;
+        lock_kernel();
        lower_file = ecryptfs_file_to_lower(file);
        if (lower_file->f_op && lower_file->f_op->fasync)
                rc = lower_file->f_op->fasync(fd, lower_file, flag);
+        unlock_kernel();
        return rc;
 }
diff --git a/fs/exec.c b/fs/exec.c
index da94a6f05df3..fd9234379e8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -610,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
        bprm->exec -= stack_shift;
        down_write(&mm->mmap_sem);
-        vm_flags = vma->vm_flags;
+        vm_flags = VM_STACK_FLAGS;
        /*
         * Adjust stack execute permissions; explicitly enable for
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
                        ext4_group_t block_group)
 {
        ext4_group_t actual_group;
-        ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+        ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
        if (actual_group == block_group)
                return 1;
        return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                                le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
                }
        } else { /* For META_BG_BLOCK_GROUPS */
-                int group_rel = (block_group -
+                bit_max += ext4_bg_num_gdb(sb, block_group);
-                                 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
-                                EXT4_DESC_PER_BLOCK(sb);
-                if (group_rel == 0 || group_rel == 1 ||
-                    (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
-                        bit_max += 1;
        }
        if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
        return 0;
 }
 /**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
 * @sb:                 super block
 * @block_group:        given block group
 *
@@ -305,7 +300,7 @@ err_out:
 * Return buffer_head on success or NULL in case of failure.
 */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
        struct ext4_group_desc * desc;
        struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
                prev = rsv;
        }
        printk("Window map complete.\n");
-        if (bad)
+        BUG_ON(bad);
-                BUG();
 }
 #define rsv_window_dump(root, verbose) \
        __rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
                count -= overflow;
        }
        brelse(bitmap_bh);
-        bitmap_bh = read_block_bitmap(sb, block_group);
+        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
        desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
        spin_unlock(sb_bgl_lock(sbi, block_group));
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks += count;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
 /**
 * ext4_has_free_blocks()
- * @sbi:                in-core super block structure.
+ * @sbi:        in-core super block structure.
+ * @nblocks:    number of needed blocks
 *
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request
+ * On success, return nblocks
 */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                                                ext4_fsblk_t nblocks)
 {
-        ext4_fsblk_t free_blocks, root_blocks;
+        ext4_fsblk_t free_blocks;
+        ext4_fsblk_t root_blocks = 0;
        free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-        root_blocks = ext4_r_blocks_count(sbi->s_es);
-        if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+        if (!capable(CAP_SYS_RESOURCE) &&
                sbi->s_resuid != current->fsuid &&
-                (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+                (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-                return 0;
+                root_blocks = ext4_r_blocks_count(sbi->s_es);
-        }
+#ifdef CONFIG_SMP
-        return 1;
+        if (free_blocks - root_blocks < FBC_BATCH)
-}
+                free_blocks =
+                        percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+        if (free_blocks - root_blocks < nblocks)
+                return free_blocks - root_blocks;
+        return nblocks;
+ }
 /**
 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
 */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-        if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+        if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
                return 0;
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 /**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
 * @count:              target number of blocks to allocate
 * @errp:               error code
 *
- * ext4_new_blocks uses a goal block to assist allocation.  It tries to
+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
- * allocate block(s) from the block group contains the goal block first. If that
+ * the block bitmap directly to do block allocation.  It tries to
- * fails, it will try to allocate block(s) from other block groups without
+ * allocate block(s) from the block group containing the goal block first. If
- * any specific goal block.
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
 *
 */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
        struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
        ext4_group_t ngroups;
        unsigned long num = *count;
-        *errp = -ENOSPC;
        sb = inode->i_sb;
        if (!sb) {
+                *errp = -ENODEV;
                printk("ext4_new_block: nonexistent device");
                return 0;
        }
+        sbi = EXT4_SB(sb);
+        if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+                /*
+                 * With delalloc we already reserved the blocks
+                 */
+                *count = ext4_has_free_blocks(sbi, *count);
+        }
+        if (*count == 0) {
+                *errp = -ENOSPC;
+                return 0;       /*return with ENOSPC error */
+        }
+        num = *count;
        /*
         * Check quota for allocation of this block.
         */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
        if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
                my_rsv = &block_i->rsv_window_node;
-        if (!ext4_has_free_blocks(sbi)) {
-                *errp = -ENOSPC;
-                goto out;
-        }
        /*
         * First, test whether the goal block is free.
         */
@@ -1734,7 +1759,7 @@ retry_alloc:
                my_rsv = NULL;
        if (free_blocks > 0) {
-                bitmap_bh = read_block_bitmap(sb, group_no);
+                bitmap_bh = ext4_read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;
                grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
                        continue;
                brelse(bitmap_bh);
-                bitmap_bh = read_block_bitmap(sb, group_no);
+                bitmap_bh = ext4_read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;
                /*
@@ -1882,7 +1907,15 @@ allocated:
        le16_add_cpu(&gdp->bg_free_blocks_count, -num);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
        spin_unlock(sb_bgl_lock(sbi, group_no));
-        percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+        if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+                percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks -= num;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
        err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
        return 0;
 }
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
+#define EXT4_META_BLOCK 0x1
-                ext4_fsblk_t goal, int *errp)
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+                                ext4_lblk_t iblock, ext4_fsblk_t goal,
+                                unsigned long *count, int *errp, int flags)
 {
        struct ext4_allocation_request ar;
        ext4_fsblk_t ret;
        if (!test_opt(inode->i_sb, MBALLOC)) {
-                unsigned long count = 1;
+                return ext4_old_new_blocks(handle, inode, goal, count, errp);
-                ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-                return ret;
        }
        memset(&ar, 0, sizeof(ar));
+        /* Fill with neighbour allocated blocks */
        ar.inode = inode;
        ar.goal = goal;
-        ar.len = 1;
+        ar.len = *count;
+        ar.logical = iblock;
+        if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+                /* enable in-core preallocation for data block allocation */
+                ar.flags = EXT4_MB_HINT_DATA;
+        else
+                /* disable in-core preallocation for non-regular files */
+                ar.flags = 0;
        ret = ext4_mb_new_blocks(handle, &ar, errp);
+        *count = ar.len;
        return ret;
 }
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @count:              total number of blocks needed
+ * @errp:               error code
+ *
+ * Return 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and any error is stored via the errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-        struct ext4_allocation_request ar;
        ext4_fsblk_t ret;
+        ret = do_blk_alloc(handle, inode, 0, goal,
-        if (!test_opt(inode->i_sb, MBALLOC)) {
+                                count, errp, EXT4_META_BLOCK);
-                ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
+        /*
-                return ret;
+         * Account for the allocated meta blocks
+         */
+        if (!(*errp)) {
+                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+                EXT4_I(inode)->i_allocated_meta_blocks += *count;
+                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        }
-        memset(&ar, 0, sizeof(ar));
-        ar.inode = inode;
-        ar.goal = goal;
-        ar.len = *count;
-        ret = ext4_mb_new_blocks(handle, &ar, errp);
-        *count = ar.len;
        return ret;
 }
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @errp:               error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+                ext4_fsblk_t goal, int *errp)
+{
+        unsigned long count = 1;
+        return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @count:              total number of blocks needed
+ * @errp:               error code
+ *
+ * Return 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and any error is stored via the errp pointer
+ */
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+                                ext4_lblk_t iblock, ext4_fsblk_t goal,
+                                unsigned long *count, int *errp)
+{
+        return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                        continue;
                desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
                brelse(bitmap_bh);
-                bitmap_bh = read_block_bitmap(sb, i);
+                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (bitmap_bh == NULL)
                        continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
                struct buffer_head *bh = NULL;
                map_bh.b_state = 0;
-                err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+                err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+                                                0, 0, 0);
                if (err > 0) {
                        pgoff_t index = map_bh.b_blocknr >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
        while (n) {
                /* Do the node's children first */
-                if ((n)->rb_left) {
+                if (n->rb_left) {
                        n = n->rb_left;
                        continue;
                }
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
                        parent->rb_right = NULL;
                n = parent;
        }
-        root->rb_node = NULL;
 }
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
 {
        struct dir_private_info *p;
-        p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+        p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
        if (!p)
                return NULL;
-        p->root.rb_node = NULL;
-        p->curr_node = NULL;
-        p->extra_fname = NULL;
-        p->last_pos = 0;
        p->curr_hash = pos2maj_hash(pos);
        p->curr_minor_hash = pos2min_hash(pos);
-        p->next_hash = 0;
        return p;
 }
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
        int     ret;
        if (!info) {
-                info = create_dir_info(filp->f_pos);
+                info = ext4_htree_create_dir_info(filp->f_pos);
                if (!info)
                        return -ENOMEM;
                filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
 #include "ext4_i.h"
 /*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
 */
 /*
@@ -45,7 +45,7 @@
 #define ext4_debug(f, a...)                                             \
        do {                                                            \
                printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",       \
-                        __FILE__, __LINE__, __FUNCTION__);              \
+                        __FILE__, __LINE__, __func__);                  \
                printk (KERN_DEBUG f, ## a);                            \
        } while (0)
 #else
@@ -74,6 +74,9 @@
 #define EXT4_MB_HINT_GOAL_ONLY          256
 /* goal is meaningful */
 #define EXT4_MB_HINT_TRY_GOAL           512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED      1024
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
        __u32   bg_reserved2[3];
 };
+/*
+ * Structure of a flex block group info
+ */
+struct flex_groups {
+        __u32 free_inodes;
+        __u32 free_blocks;
+};
 #define EXT4_BG_INODE_UNINIT    0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT    0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED    0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC              0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
        __le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
-        __u32   s_reserved[163];        /* Padding to the end of the block */
+        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
+        __u8    s_reserved_char_pad2;
+        __le16  s_reserved_pad;
+        __u32   s_reserved[162];        /* Padding to the end of the block */
 };
 #ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+                                        ext4_lblk_t iblock, ext4_fsblk_t goal,
+                                        unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                                                ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
                        ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
                unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+                ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+                ext4_grpblk_t add);
 /* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode (struct inode *, int);
 extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                                struct kstat *stat);
 extern void ext4_delete_inode (struct inode *);
 extern int  ext4_sync_inode (handle_t *, struct inode *);
 extern void ext4_discard_reservation (struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 }
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+                                             ext4_group_t block_group)
+{
+        return block_group >> sbi->s_log_groups_per_flex;
+}
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+        return 1 << sbi->s_log_groups_per_flex;
+}
 #define ext4_std_error(sb, errno)                               \
 do {                                                            \
        if ((errno))                                            \
-                __ext4_std_error((sb), __FUNCTION__, (errno));  \
+                __ext4_std_error((sb), __func__, (errno));      \
 } while (0)
 /*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
                        unsigned long max_blocks, struct buffer_head *bh_result,
                        int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
                        sector_t block, unsigned long max_blocks,
                        struct buffer_head *bh, int create,
-                        int extend_disksize);
+                        int extend_disksize, int flag);
 #endif  /* __KERNEL__ */
 #endif  /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
                (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
 };
 /*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
 */
 struct ext4_inode_info {
        __le32  i_data[15];     /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
         */
        struct rw_semaphore i_data_sem;
        struct inode vfs_inode;
+        struct jbd2_inode jinode;
        unsigned long i_ext_generation;
        struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
        /* mballoc */
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
+        /* allocation reservation info for delalloc */
+        unsigned long i_reserved_data_blocks;
+        unsigned long i_reserved_meta_blocks;
+        unsigned long i_allocated_meta_blocks;
+        unsigned short i_delalloc_reserved_flag;
+        spinlock_t i_block_reservation_lock;
 };
 #endif  /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
                                handle_t *handle, struct buffer_head *bh);
 #define ext4_journal_get_undo_access(handle, bh) \
-        __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+        __ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-        __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+        __ext4_journal_get_write_access(__func__, (handle), (bh))
 #define ext4_journal_revoke(handle, blocknr, bh) \
-        __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+        __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
-        __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+        __ext4_journal_get_create_access(__func__, (handle), (bh))
 #define ext4_journal_dirty_metadata(handle, bh) \
-        __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+        __ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
-        __ext4_journal_forget(__FUNCTION__, (handle), (bh))
+        __ext4_journal_forget(__func__, (handle), (bh))
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 #define ext4_journal_stop(handle) \
-        __ext4_journal_stop(__FUNCTION__, (handle))
+        __ext4_journal_stop(__func__, (handle))
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
        return jbd2_journal_force_commit(journal);
 }
+/*
+ * Pass the jbd2 inode embedded in @inode's ext4_inode_info to
+ * jbd2_journal_file_inode() on behalf of @handle and return its result.
+ */
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+        struct jbd2_inode *jinode = &EXT4_I(inode)->jinode;
+
+        return jbd2_journal_file_inode(handle, jinode);
+}
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
 #include <linux/rbtree.h>
 /*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
 */
 struct ext4_sb_info {
        unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
        /* locality groups */
        struct ext4_locality_group *s_locality_groups;
+        unsigned int s_log_groups_per_flex;
+        struct flex_groups *s_flex_groups;
 };
 #endif  /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+/*
+ * Make sure @handle has more than @needed buffer credits.
+ *
+ * Nothing is done if the handle already has enough credits.  Otherwise
+ * the handle is extended in place when possible.  ext4_journal_extend()
+ * returns 0 when the extension succeeded, a negative value on error and
+ * a positive value when the running transaction has no room left; only
+ * in that last case is the handle restarted in a new transaction.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
         int err;
         if (handle->h_buffer_credits > needed)
-                return handle;
+                return 0;
-        if (!ext4_journal_extend(handle, needed))
+        err = ext4_journal_extend(handle, needed);
-                return handle;
+        if (err <= 0)
-        err = ext4_journal_restart(handle, needed);
+                return err;
+        return ext4_journal_restart(handle, needed);
-        return handle;
 }
 /*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        return bg_start + colour + block;
 }
+/*
+ * Allocate one metadata block for the extent tree of @inode.
+ * The allocation goal comes from ext4_ext_find_goal(), using @path and
+ * the first logical block of @ex.  The result of ext4_new_meta_block()
+ * is returned unchanged; @err is passed through to it.
+ */
 static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                         struct ext4_ext_path *path,
                         struct ext4_extent *ex, int *err)
 {
-        ext4_fsblk_t goal, newblock;
+        ext4_fsblk_t goal;
         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-        newblock = ext4_new_block(handle, inode, goal, err);
-        return newblock;
+        return ext4_new_meta_block(handle, inode, goal, err);
 }
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
        return size;
 }
+/*
+ * Estimate the number of extent-tree metadata blocks needed to
+ * allocate @blocks blocks.
+ * Worst case is one block per extent.
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+        int root_cap = ext4_ext_space_root_idx(inode);
+        int leaf_cap = ext4_ext_space_block(inode);
+        int idx_cap = ext4_ext_space_block_idx(inode);
+        int level_blocks, total;
+
+        /* leaf blocks needed when every block is its own extent */
+        total = (blocks + leaf_cap - 1) / leaf_cap;
+
+        /*
+         * Worst case, separate index block(s) are needed to link all
+         * new leaf blocks.  Keep adding index levels until the number
+         * of blocks in the topmost level no longer exceeds root_cap.
+         */
+        level_blocks = (total + idx_cap - 1) / idx_cap;
+        for (;;) {
+                total += level_blocks;
+                level_blocks = (level_blocks + idx_cap - 1) / idx_cap;
+                if (level_blocks <= root_cap)
+                        break;
+        }
+
+        return total;
+}
 static int
 ext4_ext_max_entries(struct inode *inode, int depth)
 {
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                alloc = 1;
        }
        path[0].p_hdr = eh;
+        path[0].p_bh = NULL;
        i = depth;
        /* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        }
        path[ppos].p_depth = i;
-        path[ppos].p_hdr = eh;
        path[ppos].p_ext = NULL;
        path[ppos].p_idx = NULL;
        /* find extent */
        ext4_ext_binsearch(inode, path + ppos, block);
+        /* if not an empty leaf */
+        if (path[ppos].p_ext)
+                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
        ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        /* allocate all needed blocks */
        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
        for (a = 0; a < depth - at; a++) {
-                newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+                newblock = ext4_ext_new_meta_block(handle, inode, path,
+                                                   newext, &err);
                if (newblock == 0)
                        goto cleanup;
                ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext4_fsblk_t newblock;
        int err = 0;
-        newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+        newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
        if (newblock == 0)
                return err;
@@ -981,6 +1017,8 @@ repeat:
                /* if we found index with free entry, then use that
                 * entry: create all needed subtree and add new leaf */
                err = ext4_ext_split(handle, inode, path, newext, i);
+                if (err)
+                        goto out;
                /* refill path */
                ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
-                handle = ext4_ext_journal_restart(handle, credits);
+                err = ext4_ext_journal_restart(handle, credits);
-                if (IS_ERR(handle)) {
+                if (err)
-                        err = PTR_ERR(handle);
                        goto out;
-                }
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        int err = 0, depth, ret;
        unsigned long allocated = 0;
        struct ext4_allocation_request ar;
+        loff_t disksize;
        __clear_bit(BH_New, &bh_result->b_state);
        ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                 */
                                if (allocated > max_blocks)
                                        allocated = max_blocks;
-                                /* mark the buffer unwritten */
+                                set_buffer_unwritten(bh_result);
-                                __set_bit(BH_Unwritten, &bh_result->b_state);
                                goto out2;
                        }
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                goto out2;
        }
-        if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
-                EXT4_I(inode)->i_disksize = inode->i_size;
        /* previous routine could use block we allocated */
        newblock = ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-        __set_bit(BH_New, &bh_result->b_state);
+        if (extend_disksize) {
+                disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > EXT4_I(inode)->i_disksize)
+                        EXT4_I(inode)->i_disksize = disksize;
+        }
+        set_buffer_new(bh_result);
        /* Cache only when it is _not_ an uninitialized extent */
        if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
        if (allocated > max_blocks)
                allocated = max_blocks;
        ext4_ext_show_leaf(inode, path);
-        __set_bit(BH_Mapped, &bh_result->b_state);
+        set_buffer_mapped(bh_result);
        bh_result->b_bdev = inode->i_sb->s_bdev;
        bh_result->b_blocknr = newblock;
 out2:
@@ -2744,7 +2785,7 @@ out2:
        return err ? err : allocated;
 }
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
 {
        struct address_space *mapping = inode->i_mapping;
        struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
         */
        err = ext4_writepage_trans_blocks(inode) + 3;
        handle = ext4_journal_start(inode, err);
-        if (IS_ERR(handle)) {
+        if (IS_ERR(handle))
-                if (page) {
-                        clear_highpage(page);
-                        flush_dcache_page(page);
-                        unlock_page(page);
-                        page_cache_release(page);
-                }
                return;
-        }
-        if (page)
+        if (inode->i_size & (sb->s_blocksize - 1))
-                ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+                ext4_block_truncate_page(handle, mapping, inode->i_size);
+        if (ext4_orphan_add(handle, inode))
+                goto out_stop;
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
         * Probably we need not scan at all,
         * because page truncation is enough.
         */
-        if (ext4_orphan_add(handle, inode))
-                goto out_stop;
        /* we have to know where to truncate from in crash case */
        EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
                handle->h_sync = 1;
 out_stop:
+        up_write(&EXT4_I(inode)->i_data_sem);
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);
-        up_write(&EXT4_I(inode)->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
                }
                ret = ext4_get_blocks_wrap(handle, inode, block,
                                          max_blocks, &map_bh,
-                                          EXT4_CREATE_UNINITIALIZED_EXT, 0);
+                                          EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
        return ret;
 }
+/*
+ * VM operations installed on ext4 file mappings: faults go through
+ * filemap_fault() and the ->page_mkwrite hook calls ext4_page_mkwrite().
+ */
+static struct vm_operations_struct ext4_file_vm_ops = {
+        .fault          = filemap_fault,
+        .page_mkwrite   = ext4_page_mkwrite,
+};
+
+/*
+ * mmap handler for ext4 files.
+ * Fails with -ENOEXEC when the file's address space has no ->readpage
+ * method.  Otherwise it calls file_accessed(), installs
+ * ext4_file_vm_ops on @vma, sets VM_CAN_NONLINEAR and returns 0.
+ */
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        if (!file->f_mapping->a_ops->readpage)
+                return -ENOEXEC;
+
+        file_accessed(file);
+        vma->vm_flags |= VM_CAN_NONLINEAR;
+        vma->vm_ops = &ext4_file_vm_ops;
+        return 0;
+}
 const struct file_operations ext4_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
 #endif
-        .mmap           = generic_file_mmap,
+        .mmap           = ext4_file_mmap,
        .open           = generic_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
 const struct inode_operations ext4_file_inode_operations = {
        .truncate       = ext4_truncate,
        .setattr        = ext4_setattr,
+        .getattr        = ext4_getattr,
 #ifdef CONFIG_EXT4DEV_FS_XATTR
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
+#include <linux/blkdev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;
+        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        int ret = 0;
        J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
                        .nr_to_write = 0, /* sys_fsync did this */
                };
                ret = sync_inode(inode, &wbc);
+                if (journal && (journal->j_flags & JBD2_BARRIER))
+                        blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
        }
 out:
        return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
                                   struct ext4_group_desc *gdp);
 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
                                       struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                      ext4_group_t block_group);
 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
                                       struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
        struct ext4_super_block * es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err;
+        ext4_group_t flex_group;
        if (atomic_read(&inode->i_count) > 1) {
                printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
                        if (is_directory)
                                percpu_counter_dec(&sbi->s_dirs_counter);
+                        if (sbi->s_log_groups_per_flex) {
+                                flex_group = ext4_flex_group(sbi, block_group);
+                                spin_lock(sb_bgl_lock(sbi, flex_group));
+                                sbi->s_flex_groups[flex_group].free_inodes++;
+                                spin_unlock(sb_bgl_lock(sbi, flex_group));
+                        }
                }
                BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
                err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
        return ret;
 }
+#define free_block_ratio 10
+/*
+ * Return non-zero if flex group @fg has free inodes and more than
+ * free_block_ratio percent of its @blocks_per_flex blocks free.
+ */
+static int flex_group_has_room(struct flex_groups *fg, int blocks_per_flex)
+{
+        int free_blocks = fg->free_blocks;
+        int free_pct = free_blocks * 100 / blocks_per_flex;
+
+        return fg->free_inodes && free_pct > free_block_ratio;
+}
+
+/*
+ * Choose a block group for a new inode when flex groups are in use.
+ *
+ * Search order:
+ *   1. the flex group of @parent, then the flex group just before it;
+ *   2. the first other flex group with free inodes and more than
+ *      free_block_ratio percent of its blocks free;
+ *   3. otherwise the candidate with the most free blocks among flex
+ *      groups that still have free inodes.
+ * Inside the chosen flex group, the first block group whose descriptor
+ * reports free inodes is stored in *@best_group.
+ *
+ * Returns 0 on success and -1 when no suitable group exists.
+ */
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+                           ext4_group_t *best_group)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *desc;
+        struct buffer_head *bh;
+        struct flex_groups *flex_group = sbi->s_flex_groups;
+        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+        ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+        ext4_group_t ngroups = sbi->s_groups_count;
+        int flex_size = ext4_flex_bg_size(sbi);
+        ext4_group_t best_flex = parent_fbg_group;
+        int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+        ext4_group_t n_fbg_groups;
+        ext4_group_t i;
+
+        n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+                sbi->s_log_groups_per_flex;
+
+        /* Try the parent's flex group first, then the one before it. */
+        if (flex_group_has_room(&flex_group[best_flex], blocks_per_flex))
+                goto found_flexbg;
+        if (best_flex) {
+                best_flex--;
+                if (flex_group_has_room(&flex_group[best_flex],
+                                        blocks_per_flex))
+                        goto found_flexbg;
+        }
+
+        for (i = 0; i < n_fbg_groups; i++) {
+                /*
+                 * The parent's flex group and its predecessor were
+                 * examined above.  "i + 1 ==" avoids computing
+                 * parent_fbg_group - 1 when the parent is in group 0.
+                 */
+                if (i == parent_fbg_group || i + 1 == parent_fbg_group)
+                        continue;
+
+                if (flex_group_has_room(&flex_group[i], blocks_per_flex)) {
+                        best_flex = i;
+                        goto found_flexbg;
+                }
+
+                /* Remember the fallback with the most free blocks. */
+                if (flex_group[i].free_inodes &&
+                    flex_group[i].free_blocks >
+                    flex_group[best_flex].free_blocks)
+                        best_flex = i;
+        }
+
+        if (!flex_group[best_flex].free_inodes ||
+            !flex_group[best_flex].free_blocks)
+                return -1;
+
+found_flexbg:
+        for (i = best_flex * flex_size; i < ngroups &&
+                     i < (best_flex + 1) * flex_size; i++) {
+                desc = ext4_get_group_desc(sb, i, &bh);
+                /* Skip a group whose descriptor lookup failed. */
+                if (desc && le16_to_cpu(desc->bg_free_inodes_count)) {
+                        *best_group = i;
+                        return 0;
+                }
+        }
+
+        return -1;
+}
 /*
 * Orlov's allocator for directories.
 *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        struct inode *ret;
        ext4_group_t i;
        int free = 0;
+        ext4_group_t flex_group;
        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
+        if (sbi->s_log_groups_per_flex) {
+                ret2 = find_group_flex(sb, dir, &group);
+                goto got_group;
+        }
        if (S_ISDIR(mode)) {
                if (test_opt (sb, OLDALLOC))
                        ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
        } else
                ret2 = find_group_other(sb, dir, &group);
+got_group:
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;
@@ -600,7 +689,7 @@ got:
        /* We may have to initialize the block bitmap if it isn't already */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                struct buffer_head *block_bh = read_block_bitmap(sb, group);
+                struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
                BUFFER_TRACE(block_bh, "get block bitmap access");
                err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
                percpu_counter_inc(&sbi->s_dirs_counter);
        sb->s_dirt = 1;
+        if (sbi->s_log_groups_per_flex) {
+                flex_group = ext4_flex_group(sbi, group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_inodes--;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        inode->i_uid = current->fsuid;
        if (test_opt (sb, GRPID))
                inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
                goto fail_free_drop;
        if (test_opt(sb, EXTENTS)) {
-                /* set extent flag only for diretory, file and normal symlink*/
+                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
                        ext4_ext_tree_init(handle, inode);
-                        err = ext4_update_incompat_feature(handle, sb,
-                                        EXT4_FEATURE_INCOMPAT_EXTENTS);
-                        if (err)
-                                goto fail_free_drop;
                }
        }
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        if (IS_ERR(inode))
                goto iget_failed;
+        /*
+         * If the orphan has i_nlink > 0 then it must be possible to
+         * truncate it; otherwise it would never be removed from the orphan
+         * list during processing and an infinite loop would result.
+         */
+        if (inode->i_nlink && !ext4_can_truncate(inode))
+                goto bad_orphan;
        if (NEXT_ORPHAN(inode) > max_ino)
                goto bad_orphan;
        brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
                printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(inode));
                printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+                printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (inode->i_nlink == 0)
                        inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "ext4_extents.h"
+/*
+ * Pass the jbd2 inode embedded in @inode's ext4_inode_info, together
+ * with @new_size, to jbd2_journal_begin_ordered_truncate() and return
+ * its result.
+ */
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+                                              loff_t new_size)
+{
+        struct jbd2_inode *jinode = &EXT4_I(inode)->jinode;
+
+        return jbd2_journal_begin_ordered_truncate(jinode, new_size);
+}
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
 /*
 * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
 {
        handle_t *handle;
+        if (ext4_should_order_data(inode))
+                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 *              direct blocks
 */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t goal, int indirect_blks, int blks,
+                                ext4_lblk_t iblock, ext4_fsblk_t goal,
-                        ext4_fsblk_t new_blocks[4], int *err)
+                                int indirect_blks, int blks,
+                                ext4_fsblk_t new_blocks[4], int *err)
 {
        int target, i;
-        unsigned long count = 0;
+        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
        ext4_fsblk_t current_block = 0;
        int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
         * the first direct block of this branch.  That's the
         * minimum number of blocks need to allocate(required)
         */
-        target = blks + indirect_blks;
+        /* first we try to allocate the indirect blocks */
+        target = indirect_blks;
-        while (1) {
+        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
-                current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+                current_block = ext4_new_meta_blocks(handle, inode,
+                                                        goal, &count, err);
                if (*err)
                        goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                        new_blocks[index++] = current_block++;
                        count--;
                }
+                if (count > 0) {
-                if (count > 0)
+                        /*
+                         * save the new block number
+                         * for the first direct block
+                         */
+                        new_blocks[index] = current_block;
+                        printk(KERN_INFO "%s returned more blocks than "
+                                                "requested\n", __func__);
+                        WARN_ON(1);
                        break;
+                }
        }
-        /* save the new block number for the first direct block */
+        target = blks - count ;
-        new_blocks[index] = current_block;
+        blk_allocated = count;
+        if (!target)
+                goto allocated;
+        /* Now allocate data blocks */
+        count = target;
+        /* allocating blocks for data blocks */
+        current_block = ext4_new_blocks(handle, inode, iblock,
+                                                goal, &count, err);
+        if (*err && (target == blks)) {
+                /*
+                 * if the allocation failed and we didn't allocate
+                 * any blocks before
+                 */
+                goto failed_out;
+        }
+        if (!*err) {
+                if (target == blks) {
+                /*
+                 * save the new block number
+                 * for the first direct block
+                 */
+                        new_blocks[index] = current_block;
+                }
+                blk_allocated += count;
+        }
+allocated:
        /* total number of blocks allocated for direct blocks */
-        ret = count;
+        ret = blk_allocated;
        *err = 0;
        return ret;
 failed_out:
@@ -584,8 +631,9 @@ failed_out:
 *      as described above and return 0.
 */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-                        int indirect_blks, int *blks, ext4_fsblk_t goal,
+                                ext4_lblk_t iblock, int indirect_blks,
-                        ext4_lblk_t *offsets, Indirect *branch)
+                                int *blks, ext4_fsblk_t goal,
+                                ext4_lblk_t *offsets, Indirect *branch)
 {
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
        ext4_fsblk_t new_blocks[4];
        ext4_fsblk_t current_block;
-        num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+        num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
                                *blks, new_blocks, &err);
        if (err)
                return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        struct ext4_inode_info *ei = EXT4_I(inode);
        int count = 0;
        ext4_fsblk_t first_block = 0;
+        loff_t disksize;
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        /*
         * Block out ext4_truncate while we alter the tree
         */
-        err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
+        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
-                                offsets + (partial - chain), partial);
+                                        &count, goal,
+                                        offsets + (partial - chain), partial);
        /*
         * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
         * protect it if you're about to implement concurrent
         * ext4_get_block() -bzzz
        */
-        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
+        if (!err && extend_disksize) {
-                ei->i_disksize = inode->i_size;
+                disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > ei->i_disksize)
+                        ei->i_disksize = disksize;
+        }
        if (err)
                goto cleanup;
@@ -934,7 +989,7 @@ out:
 */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                        unsigned long max_blocks, struct buffer_head *bh,
-                        int create, int extend_disksize)
+                        int create, int extend_disksize, int flag)
 {
        int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
         * with create == 1 flag.
         */
        down_write((&EXT4_I(inode)->i_data_sem));
+        /*
+         * if the caller is from delayed allocation writeout path
+         * we have already reserved fs blocks for allocation
+         * let the underlying get_block() function know to
+         * avoid double accounting
+         */
+        if (flag)
+                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                                                        ~EXT4_EXT_MIGRATE;
                }
        }
+        if (flag) {
+                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                /*
+                 * Update reserved blocks/metadata blocks
+                 * after successful block allocation
+                 * which were deferred till now
+                 */
+                if ((retval > 0) && buffer_delay(bh))
+                        ext4_da_release_space(inode, retval, 0);
+        }
        up_write((&EXT4_I(inode)->i_data_sem));
        return retval;
 }
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
        }
        ret = ext4_get_blocks_wrap(handle, inode, iblock,
-                                        max_blocks, bh_result, create, 0);
+                                        max_blocks, bh_result, create, 0, 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        err = ext4_get_blocks_wrap(handle, inode, block, 1,
-                                        &dummy, create, 1);
+                                        &dummy, create, 1, 0);
        /*
         * ext4_get_blocks_handle() returns number of blocks
         * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        to = from + len;
 retry:
-        page = __grab_cache_page(mapping, index);
-        if (!page)
-                return -ENOMEM;
-        *pagep = page;
        handle = ext4_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
-                unlock_page(page);
-                page_cache_release(page);
                ret = PTR_ERR(handle);
                goto out;
        }
+        page = __grab_cache_page(mapping, index);
+        if (!page) {
+                ext4_journal_stop(handle);
+                ret = -ENOMEM;
+                goto out;
+        }
+        *pagep = page;
        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                                        ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
        }
        if (ret) {
-                ext4_journal_stop(handle);
                unlock_page(page);
+                ext4_journal_stop(handle);
                page_cache_release(page);
        }
@@ -1236,15 +1313,6 @@ out:
        return ret;
 }
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-        int err = jbd2_journal_dirty_data(handle, bh);
-        if (err)
-                ext4_journal_abort_handle(__func__, __func__,
-                                                bh, handle, err);
-        return err;
-}
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 /*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
-                                struct address_space *mapping,
-                                loff_t pos, unsigned len, unsigned copied,
-                                struct page *page, void *fsdata)
-{
-        struct inode *inode = file->f_mapping->host;
-        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (pos+copied > inode->i_size) {
-                i_size_write(inode, pos+copied);
-                mark_inode_dirty(inode);
-        }
-        return copied;
-}
-/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
                                struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-        struct inode *inode = file->f_mapping->host;
+        struct inode *inode = mapping->host;
        unsigned from, to;
        int ret = 0, ret2;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
-        ret = walk_page_buffers(handle, page_buffers(page),
+        ret = ext4_jbd2_file_inode(handle, inode);
-                from, to, NULL, ext4_journal_dirty_data);
        if (ret == 0) {
                /*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
                new_i_size = pos + copied;
                if (new_i_size > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = new_i_size;
-                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+                ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
                if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        unlock_page(page);
-        page_cache_release(page);
        return ret ? ret : copied;
 }
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
                                struct page *page, void *fsdata)
 {
        handle_t *handle = ext4_journal_current_handle();
-        struct inode *inode = file->f_mapping->host;
+        struct inode *inode = mapping->host;
        int ret = 0, ret2;
        loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
        if (new_i_size > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = new_i_size;
-        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+        ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
        if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        unlock_page(page);
-        page_cache_release(page);
        return ret ? ret : copied;
 }
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
                        ret = ret2;
        }
+        unlock_page(page);
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        unlock_page(page);
        page_cache_release(page);
        return ret ? ret : copied;
 }
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * in order to allocate @blocks data blocks for a non-extent-based file.
+ *
+ * @inode:  inode being written; EXT4_ADDR_PER_BLOCK() is evaluated on
+ *          its superblock
+ * @blocks: number of data blocks that are going to be allocated
+ *
+ * Returns ind_blks + dind_blks + tind_blks as computed below.
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+        int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+        int ind_blks, dind_blks, tind_blks;
+        /* number of new indirect blocks needed (division rounds up) */
+        ind_blks = (blocks + icap - 1) / icap;
+        dind_blks = (ind_blks + icap - 1) / icap;  /* same rounding, on ind_blks */
+        tind_blks = 1;          /* always accounted as exactly one block */
+        return ind_blks + dind_blks + tind_blks;
+}
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * in order to allocate the given number of blocks.
+ *
+ * Dispatches on EXT4_EXTENTS_FL in the inode flags: inodes with the
+ * flag set use ext4_ext_calc_metadata_amount(), all other inodes use
+ * ext4_indirect_calc_metadata_amount().
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+                return ext4_ext_calc_metadata_amount(inode, blocks);
+        return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+/*
+ * Reserve space for @nrblocks data blocks plus the metadata blocks
+ * needed to allocate them later.
+ *
+ * All accounting is done under i_block_reservation_lock.  The metadata
+ * requirement is recomputed for the new total of reserved data blocks
+ * and only the increase over i_reserved_meta_blocks (md_needed) is
+ * charged against the filesystem's free blocks counter.
+ *
+ * Returns 0 on success, or -ENOSPC when ext4_has_free_blocks() reports
+ * fewer blocks than the total being requested.
+ */
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       unsigned long md_needed, mdblocks, total = 0;
+        /*
+         * recalculate the amount of metadata blocks to reserve
+         * in order to allocate nrblocks
+         * worse case is one extent per block
+         */
+        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+        mdblocks = ext4_calc_metadata_amount(inode, total);
+        BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+        /* from here on 'total' is the number of blocks charged by this call */
+        total = md_needed + nrblocks;
+        if (ext4_has_free_blocks(sbi, total) < total) {
+                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                return -ENOSPC;
+        }
+        /* reduce fs free blocks counter */
+        percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+        EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        return 0;       /* success */
+}
+/*
+ * Drop part of an inode's block reservation.
+ *
+ * @inode:   inode whose reservation counters are updated
+ * @used:    reserved data blocks that leave the per-inode reservation
+ *           without being added back to the free blocks counter
+ *           (ext4_get_blocks_wrap() passes its retval here)
+ * @to_free: reserved data blocks that leave the reservation and are
+ *           added back to s_freeblocks_counter
+ *
+ * The metadata reservation is recalculated for the data blocks that
+ * remain reserved; the excess, less i_allocated_meta_blocks, is added
+ * back to s_freeblocks_counter as well.  i_allocated_meta_blocks is
+ * reset to zero.  All updates happen under i_block_reservation_lock.
+ */
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        int total, mdb, mdb_free, release;
+        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        /* recalculate the number of metablocks still need to be reserved */
+        total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+        mdb = ext4_calc_metadata_amount(inode, total);
+        /* figure out how many metablocks to release */
+        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+        /* Account for allocated meta_blocks */
+        mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+        release = to_free + mdb_free;
+        /* update fs free blocks counter for truncate case */
+        percpu_counter_add(&sbi->s_freeblocks_counter, release);
+        /* update per-inode reservations */
+        BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+        EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+        EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+        EXT4_I(inode)->i_allocated_meta_blocks = 0;
+        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+/*
+ * Release the reservation held by the delayed buffers of @page.
+ *
+ * @page:   page whose buffer heads are walked
+ * @offset: byte offset within the page; only buffers that start at or
+ *          after it are considered
+ *
+ * Every such buffer with the delay bit set has the bit cleared and is
+ * counted; the count is then handed to ext4_da_release_space() as the
+ * number of blocks to free.
+ */
+static void ext4_da_page_release_reservation(struct page *page,
+                                                unsigned long offset)
+{
+        int to_release = 0;
+        struct buffer_head *head, *bh;
+        unsigned int curr_off = 0;
+        head = page_buffers(page);
+        bh = head;
+        do {
+                /* curr_off/next_off track each buffer's byte range in the page */
+                unsigned int next_off = curr_off + bh->b_size;
+                if ((offset <= curr_off) && (buffer_delay(bh))) {
+                        to_release++;
+                        clear_buffer_delay(bh);
+                }
+                curr_off = next_off;
+        } while ((bh = bh->b_this_page) != head);
+        ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+/*
+ * Delayed allocation stuff
+ *
+ * struct mpage_da_data - context carried through one mpage_da_writepages()
+ * run
+ *
+ * @inode:      inode being written back
+ * @lbh:        accumulates the current extent of blocks: b_blocknr is the
+ *              first logical block, b_size the length in bytes and
+ *              b_state the BH_FLAGS bits shared by all blocks in it
+ * @first_page: index of the first page of the current extent of pages
+ * @next_page:  index of the page after the last page of that extent
+ * @get_block:  the filesystem's block mapper function
+ * @wbc:        writeback control handed on to __mpage_writepage()
+ */
+struct mpage_da_data {
+        struct inode *inode;
+        struct buffer_head lbh;                 /* extent of blocks */
+        unsigned long first_page, next_page;    /* extent of pages */
+        get_block_t *get_block;
+        struct writeback_control *wbc;
+};
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ *
+ * Returns 0, or the first non-zero value returned by
+ * __mpage_writepage(); later errors are not reported.  Any bio still
+ * pending in the local mpage_data when the walk ends is submitted as a
+ * WRITE.
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+        struct address_space *mapping = mpd->inode->i_mapping;
+        struct mpage_data mpd_pp = {
+                .bio = NULL,
+                .last_block_in_bio = 0,
+                .get_block = mpd->get_block,
+                .use_writepage = 1,
+        };
+        int ret = 0, err, nr_pages, i;
+        unsigned long index, end;
+        struct pagevec pvec;
+        BUG_ON(mpd->next_page <= mpd->first_page);
+        pagevec_init(&pvec, 0);
+        index = mpd->first_page;
+        end = mpd->next_page - 1;       /* index of the last page, inclusive */
+        while (index <= end) {
+                /* XXX: optimize tail */
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        index = page->index;
+                        if (index > end)        /* looked-up page lies past the extent */
+                                break;
+                        index++;                /* next lookup resumes after this page */
+                        err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+                        /*
+                         * In error case, we have to continue because
+                         * remaining pages are still locked
+                         * XXX: unlock and re-dirty them?
+                         */
+                        if (ret == 0)
+                                ret = err;
+                }
+                pagevec_release(&pvec);
+        }
+        if (mpd_pp.bio)
+                mpage_bio_submit(WRITE, mpd_pp.bio);
+        return ret;
+}
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ *
+ * Every page visited must be locked, must not be under writeback and
+ * must have buffers attached; each of these is enforced with BUG_ON().
+ * A buffer that is already mapped (and not delayed) must already carry
+ * the expected disk block number, which is likewise checked with
+ * BUG_ON().
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+                                 struct buffer_head *exbh)
+{
+        struct inode *inode = mpd->inode;
+        struct address_space *mapping = inode->i_mapping;
+        int blocks = exbh->b_size >> inode->i_blkbits;
+        sector_t pblock = exbh->b_blocknr, cur_logical;
+        struct buffer_head *head, *bh;
+        unsigned long index, end;
+        struct pagevec pvec;
+        int nr_pages, i;
+        /* page indices holding the first and the last block of the range */
+        index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        /* logical number of the first block of the first page */
+        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        pagevec_init(&pvec, 0);
+        while (index <= end) {
+                /* XXX: optimize tail */
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        index = page->index;
+                        if (index > end)
+                                break;
+                        index++;
+                        BUG_ON(!PageLocked(page));
+                        BUG_ON(PageWriteback(page));
+                        BUG_ON(!page_has_buffers(page));
+                        bh = page_buffers(page);
+                        head = bh;
+                        /* skip blocks out of the range */
+                        do {
+                                if (cur_logical >= logical)
+                                        break;
+                                cur_logical++;
+                        } while ((bh = bh->b_this_page) != head);
+                        /* assign disk blocks until the range or the page ends */
+                        do {
+                                if (cur_logical >= logical + blocks)
+                                        break;
+                                if (buffer_delay(bh)) {
+                                        bh->b_blocknr = pblock;
+                                        clear_buffer_delay(bh);
+                                } else if (buffer_mapped(bh))
+                                        BUG_ON(bh->b_blocknr != pblock);
+                                cur_logical++;
+                                pblock++;
+                        } while ((bh = bh->b_this_page) != head);
+                }
+                pagevec_release(&pvec);
+        }
+}
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ *
+ * @inode: inode whose superblock's block device is used
+ * @bh:    describes the range: b_blocknr is the first block and
+ *         b_size the length in bytes
+ *
+ * Calls unmap_underlying_metadata() once per block in the range.
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+                                             struct buffer_head *bh)
+{
+        struct block_device *bdev = inode->i_sb->s_bdev;
+        int blocks, i;
+        blocks = bh->b_size >> inode->i_blkbits;        /* bytes -> blocks */
+        for (i = 0; i < blocks; i++)
+                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ *
+ * ->get_block() is called with create == 1, repeatedly, until the whole
+ * byte range described by @mpd->lbh has been covered or an error is
+ * returned.
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        int err = 0, remain = lbh->b_size;
+        sector_t next = lbh->b_blocknr;
+        struct buffer_head new;
+        /*
+         * We consider only non-mapped and non-allocated blocks
+         */
+        if (buffer_mapped(lbh) && !buffer_delay(lbh))
+                return;
+        while (remain) {
+                /*
+                 * NOTE(review): 'new' lives on the stack and only these
+                 * three fields are set before ->get_block() is called.
+                 */
+                new.b_state = lbh->b_state;
+                new.b_blocknr = 0;
+                new.b_size = remain;
+                err = mpd->get_block(mpd->inode, next, &new, 1);
+                if (err) {
+                        /*
+                         * Rather than implement own error handling
+                         * here, we just leave remaining blocks
+                         * unallocated and try again with ->writepage()
+                         */
+                        break;
+                }
+                BUG_ON(new.b_size == 0);
+                if (buffer_new(&new))
+                        __unmap_underlying_blocks(mpd->inode, &new);
+                /*
+                 * If blocks are delayed marked, we need to
+                 * put actual blocknr and drop delayed bit
+                 */
+                if (buffer_delay(lbh))
+                        mpage_put_bnr_to_bhs(mpd, next, &new);
+                /* go for the remaining blocks */
+                next += new.b_size >> mpd->inode->i_blkbits;
+                remain -= new.b_size;
+        }
+}
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contig. blocks in same state
+ *
+ * Only the state bits in BH_FLAGS take part in the "same state"
+ * comparison.  When @bh cannot be merged, the extent collected so far
+ * is passed to mpage_da_map_blocks() and a new extent is started
+ * from @bh.
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+                                   sector_t logical, struct buffer_head *bh)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        sector_t next;
+        /* logical block that would directly follow the current extent */
+        next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+        /*
+         * First block in the extent
+         */
+        if (lbh->b_size == 0) {
+                lbh->b_blocknr = logical;
+                lbh->b_size = bh->b_size;
+                lbh->b_state = bh->b_state & BH_FLAGS;
+                return;
+        }
+        /*
+         * Can we merge the block to our big extent?
+         */
+        if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+                lbh->b_size += bh->b_size;
+                return;
+        }
+        /*
+         * We couldn't merge the block to our extent, so we
+         * need to flush current  extent and start new one
+         */
+        mpage_da_map_blocks(mpd);
+        /*
+         * Now start a new extent
+         */
+        lbh->b_size = bh->b_size;
+        lbh->b_state = bh->b_state & BH_FLAGS;
+        lbh->b_blocknr = logical;
+}
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ *
+ * @data points to the struct mpage_da_data set up by
+ * mpage_da_writepages(), which passes this function to
+ * write_cache_pages() as the per-page callback.
+ *
+ * Always returns 0.
+ */
+static int __mpage_da_writepage(struct page *page,
+                                struct writeback_control *wbc, void *data)
+{
+        struct mpage_da_data *mpd = data;
+        struct inode *inode = mpd->inode;
+        struct buffer_head *bh, *head, fake;
+        sector_t logical;
+        /*
+         * Can we merge this page to current extent?
+         */
+        if (mpd->next_page != page->index) {
+                /*
+                 * Nope, we can't. So, we map non-allocated blocks
+                 * and start IO on them using __mpage_writepage()
+                 */
+                if (mpd->next_page != mpd->first_page) {
+                        mpage_da_map_blocks(mpd);
+                        mpage_da_submit_io(mpd);
+                }
+                /*
+                 * Start next extent of pages ...
+                 */
+                mpd->first_page = page->index;
+                /*
+                 * ... and blocks
+                 */
+                mpd->lbh.b_size = 0;
+                mpd->lbh.b_state = 0;
+                mpd->lbh.b_blocknr = 0;
+        }
+        mpd->next_page = page->index + 1;
+        /* logical number of the first block of this page */
+        logical = (sector_t) page->index <<
+                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        if (!page_has_buffers(page)) {
+                /*
+                 * There are no buffer heads attached yet (mmap?),
+                 * so we treat the page as full of dirty blocks.
+                 * 'fake' is a stack buffer_head; only its b_size and
+                 * b_state are set up here.
+                 */
+                bh = &fake;
+                bh->b_size = PAGE_CACHE_SIZE;
+                bh->b_state = 0;
+                set_buffer_dirty(bh);
+                set_buffer_uptodate(bh);
+                mpage_add_bh_to_extent(mpd, logical, bh);
+        } else {
+                /*
+                 * Page with regular buffer heads, just add all dirty ones
+                 */
+                head = page_buffers(page);
+                bh = head;
+                do {
+                        BUG_ON(buffer_locked(bh));
+                        if (buffer_dirty(bh))
+                                mpage_add_bh_to_extent(mpd, logical, bh);
+                        logical++;
+                } while ((bh = bh->b_this_page) != head);
+        }
+        return 0;
+}
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocate non-allocated blocks, map newly-allocated
+ * blocks to existing bhs and issue IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ *
+ * Returns the value of write_cache_pages(), or of generic_writepages()
+ * when @get_block is NULL.  The result of submitting the last extent
+ * is not propagated to the caller.
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+                               struct writeback_control *wbc,
+                               get_block_t get_block)
+{
+        struct mpage_da_data mpd;
+        int ret;
+        if (!get_block)
+                return generic_writepages(mapping, wbc);
+        mpd.wbc = wbc;
+        mpd.inode = mapping->host;
+        mpd.lbh.b_size = 0;
+        mpd.lbh.b_state = 0;
+        mpd.lbh.b_blocknr = 0;
+        mpd.first_page = 0;
+        mpd.next_page = 0;
+        mpd.get_block = get_block;
+        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+        /*
+         * Handle last extent of pages
+         */
+        if (mpd.next_page != mpd.first_page) {
+                mpage_da_map_blocks(&mpd);
+                mpage_da_submit_io(&mpd);
+        }
+        return ret;
+}
+/*
+ * This is a special callback for ->write_begin() only.
+ * Its intention is to return a mapped block or to reserve space.
+ *
+ * @inode:     inode being written
+ * @iblock:    logical block to look up
+ * @bh_result: must describe exactly one filesystem block (b_size ==
+ *             s_blocksize, enforced with BUG_ON)
+ * @create:    must be non-zero (enforced with BUG_ON)
+ *
+ * Returns 0 on success; otherwise the error from
+ * ext4_da_reserve_space() or a negative value from
+ * ext4_get_blocks_wrap().
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+        int ret = 0;
+        BUG_ON(create == 0);
+        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        /*
+         * first, we need to know whether the block is allocated already
+         * preallocated blocks are unmapped but should treated
+         * the same as allocated blocks.
+         */
+        ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+        if ((ret == 0) && !buffer_delay(bh_result)) {
+                /* the block isn't (pre)allocated yet, let's reserve space */
+                /*
+                 * XXX: __block_prepare_write() unmaps passed block,
+                 * is it OK?
+                 */
+                ret = ext4_da_reserve_space(inode, 1);
+                if (ret)
+                        /* not enough space to reserve */
+                        return ret;
+                /* map to block 0 as a placeholder and flag as new + delayed */
+                map_bh(bh_result, inode->i_sb, 0);
+                set_buffer_new(bh_result);
+                set_buffer_delay(bh_result);
+        } else if (ret > 0) {
+                /* ret blocks were found mapped: report their size, return 0 */
+                bh_result->b_size = (ret << inode->i_blkbits);
+                ret = 0;
+        }
+        return ret;
+}
+#define         EXT4_DELALLOC_RSVED     1
+/*
+ * Block mapper used when delayed-allocation pages are written out.
+ *
+ * Without a current journal handle only a lookup is done (create == 0),
+ * and a return of 0 from ext4_get_blocks_wrap() triggers BUG_ON().
+ * With a handle, @create is passed through together with
+ * EXT4_DELALLOC_RSVED as the 'flag' argument of ext4_get_blocks_wrap().
+ *
+ * When blocks were mapped, bh_result->b_size is set to their size in
+ * bytes and i_disksize is raised, under i_data_sem, to the end of the
+ * mapped range capped at i_size.  If i_disksize then equals that value
+ * the inode is marked dirty and the result of ext4_mark_inode_dirty()
+ * is returned.
+ *
+ * Returns 0 on success, otherwise the non-positive value from
+ * ext4_get_blocks_wrap() or the result of ext4_mark_inode_dirty().
+ */
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        int ret;
+        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        /* this initial value is overwritten before disksize is read below */
+        loff_t disksize = EXT4_I(inode)->i_disksize;
+        handle_t *handle = NULL;
+        handle = ext4_journal_current_handle();
+        if (!handle) {
+                ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, 0, 0, 0);
+                BUG_ON(!ret);
+        } else {
+                ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+        }
+        if (ret > 0) {
+                bh_result->b_size = (ret << inode->i_blkbits);
+                /*
+                 * Update on-disk size along with block allocation
+                 * we don't use 'extend_disksize' as size may change
+                 * within already allocated block -bzzz
+                 */
+                disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > EXT4_I(inode)->i_disksize) {
+                        /*
+                         * XXX: replace with spinlock if seen contended -bzzz
+                         */
+                        down_write(&EXT4_I(inode)->i_data_sem);
+                        /* re-check now that i_data_sem is held */
+                        if (disksize > EXT4_I(inode)->i_disksize)
+                                EXT4_I(inode)->i_disksize = disksize;
+                        up_write(&EXT4_I(inode)->i_data_sem);
+                        if (EXT4_I(inode)->i_disksize == disksize) {
+                                ret = ext4_mark_inode_dirty(handle, inode);
+                                return ret;
+                        }
+                }
+                ret = 0;
+        }
+        return ret;
+}
+/*
+ * Buffer predicate, passed to walk_page_buffers() by ext4_da_writepage().
+ *
+ * @handle: not used
+ * @bh:     buffer to test
+ *
+ * Returns non-zero when @bh is dirty and is either not mapped or has
+ * the delay bit set; returns 0 otherwise.
+ */
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+        /*
+         * unmapped buffer is possible for holes.
+         * delay buffer is possible with delayed allocation
+         */
+        return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+/*
+ * Lookup-only block mapper: maps blocks that already exist and never
+ * allocates.
+ *
+ * @inode:     inode being written
+ * @iblock:    first logical block to look up
+ * @bh_result: b_size on entry gives the maximum range to map; on success
+ *             it is set to the size in bytes of the range found
+ * @create:    ignored; ext4_get_blocks_wrap() is always called with
+ *             create == 0
+ *
+ * Returns 0 when blocks were found, otherwise the non-positive value
+ * returned by ext4_get_blocks_wrap().
+ */
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        int ret = 0;
+        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        /*
+         * we don't want to do block allocation in writepage
+         * so call get_block_wrap with create = 0
+         */
+        ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+                                   bh_result, 0, 0, 0);
+        if (ret > 0) {
+                bh_result->b_size = (ret << inode->i_blkbits);
+                ret = 0;
+        }
+        return ret;
+}
+/*
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+                                struct writeback_control *wbc)
+{
+        int ret = 0;
+        loff_t size;
+        unsigned long len;
+        struct buffer_head *page_bufs;
+        struct inode *inode = page->mapping->host;
+        size = i_size_read(inode);
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                page_bufs = page_buffers(page);
+                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                        ext4_bh_unmapped_or_delay)) {
+                        /*
+                         * We don't want to do  block allocation
+                         * So redirty the page and return
+                         * We may reach here when we do a journal commit
+                         * via journal_submit_inode_data_buffers.
+                         * If we don't have mapping block we just ignore
+                         * them. We can also reach here via shrink_page_list
+                         */
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        return 0;
+                }
+        } else {
+                /*
+                 * The test for page_has_buffers() is subtle:
+                 * We know the page is dirty but it lost buffers. That means
+                 * that at some moment in time after write_begin()/write_end()
+                 * has been called all buffers have been clean and thus they
+                 * must have been written at least once. So they are all
+                 * mapped and we can happily proceed with mapping them
+                 * and writing the page.
+                 *
+                 * Try to initialize the buffer_heads and check whether
+                 * all are mapped and non delay. We don't want to
+                 * do block allocation here.
+                 */
+                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                                                ext4_normal_get_block_write);
+                if (!ret) {
+                        page_bufs = page_buffers(page);
+                        /* check whether all are mapped and non delay */
+                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                                ext4_bh_unmapped_or_delay)) {
+                                redirty_page_for_writepage(wbc, page);
+                                unlock_page(page);
+                                return 0;
+                        }
+                } else {
+                        /*
+                         * We can't do block allocation here
+                         * so just redity the page and unlock
+                         * and return
+                         */
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        return 0;
+                }
+        }
+        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+                ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+        else
+                ret = block_write_full_page(page,
+                                                ext4_normal_get_block_write,
+                                                wbc);
+        return ret;
+}
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+/*
+ * Write back dirty pages of a delalloc mapping in batches, each batch
+ * inside its own journal transaction.
+ *
+ * Every loop iteration starts a handle with EXT4_MAX_WRITEBACK_CREDITS
+ * credits, caps wbc->nr_to_write at EXT4_MAX_WRITEBACK_PAGES, calls
+ * mpage_da_writepages() with ext4_da_get_block_write and stops the handle.
+ * to_write tracks how much of the caller's original nr_to_write budget is
+ * left; it is stored back into wbc->nr_to_write before returning.
+ *
+ * Returns 0 or the first error from ext4_journal_start(),
+ * ext4_jbd2_file_inode() or mpage_da_writepages().
+ *
+ * NOTE(review): wbc->range_cont is forced to 1 when range_cyclic is clear
+ * and is not reset in this function.
+ */
+static int ext4_da_writepages(struct address_space *mapping,
+                                struct writeback_control *wbc)
+{
+        struct inode *inode = mapping->host;
+        handle_t *handle = NULL;
+        int needed_blocks;
+        int ret = 0;
+        long to_write;
+        loff_t range_start = 0;
+        /*
+         * No pages to write? This is mainly a kludge to avoid starting
+         * a transaction for special inodes like journal inode on last iput()
+         * because that could violate lock ordering on umount
+         */
+        if (!mapping->nrpages)
+                return 0;
+        /*
+         * Estimate the worst case needed credits to write out
+         * EXT4_MAX_WRITEBACK_PAGES pages
+         */
+        needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+        to_write = wbc->nr_to_write;
+        if (!wbc->range_cyclic) {
+                /*
+                 * If range_cyclic is not set force range_cont
+                 * and save the old range_start so that it can be
+                 * restored on exit
+                 */
+                wbc->range_cont = 1;
+                range_start =  wbc->range_start;
+        }
+        while (!ret && to_write) {
+                /* start a new transaction */
+                handle = ext4_journal_start(inode, needed_blocks);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        goto out_writepages;
+                }
+                if (ext4_should_order_data(inode)) {
+                        /*
+                         * With ordered mode we need to add
+                         * the inode to the journal handle
+                         * when we do block allocation.
+                         */
+                        ret = ext4_jbd2_file_inode(handle, inode);
+                        if (ret) {
+                                ext4_journal_stop(handle);
+                                goto out_writepages;
+                        }
+                }
+                /*
+                 * Limit the number of dirty pages written at a time
+                 * so that they fit into the reserved transaction credits
+                 */
+                if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+                        wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+                /* charge this batch against the remaining budget up front */
+                to_write -= wbc->nr_to_write;
+                ret = mpage_da_writepages(mapping, wbc,
+                                                ext4_da_get_block_write);
+                ext4_journal_stop(handle);
+                if (wbc->nr_to_write) {
+                        /*
+                         * There is no more writeout needed
+                         * or we requested a non-blocking writeout
+                         * and we found the device congested.
+                         * Give the unused part of the batch back.
+                         */
+                        to_write += wbc->nr_to_write;
+                        break;
+                }
+                wbc->nr_to_write = to_write;
+        }
+out_writepages:
+        /* report the unused budget back to the caller */
+        wbc->nr_to_write = to_write;
+        /* only a non-zero saved range_start is written back */
+        if (range_start)
+                wbc->range_start = range_start;
+        return ret;
+}
+/*
+ * ext4_da_write_begin - prepare a page for a buffered write (delalloc)
+ * @file:    file being written
+ * @mapping: address space of the file
+ * @pos:     byte offset of the write
+ * @len:     length of the write
+ * @flags:   passed through to block_write_begin()
+ * @pagep:   receives the locked page on success
+ * @fsdata:  passed through to block_write_begin()
+ *
+ * Starts a one-credit journal handle, grabs the page and lets
+ * block_write_begin() set up its buffers through ext4_da_get_block_prep.
+ * On success the handle is left running; ext4_da_write_end() picks it up
+ * with ext4_journal_current_handle() and stops it.  On every failure path
+ * the handle is stopped here before returning.
+ *
+ * Returns 0 on success or a negative error.  -ENOSPC is retried for as
+ * long as ext4_should_retry_alloc() allows.
+ */
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned flags,
+                                struct page **pagep, void **fsdata)
+{
+        int ret, retries = 0;
+        struct page *page;
+        pgoff_t index;
+        struct inode *inode = mapping->host;
+        handle_t *handle;
+        index = pos >> PAGE_CACHE_SHIFT;
+retry:
+        /*
+         * With delayed allocation, we don't log the i_disksize update
+         * if there is delayed block allocation. But we still need
+         * to journal the i_disksize update if the write goes to the end
+         * of a file which has an already mapped buffer.
+         */
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+        page = __grab_cache_page(mapping, index);
+        if (!page) {
+                /* the handle was already started: do not leak it */
+                ext4_journal_stop(handle);
+                ret = -ENOMEM;
+                goto out;
+        }
+        *pagep = page;
+        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                                                        ext4_da_get_block_prep);
+        if (ret < 0) {
+                unlock_page(page);
+                ext4_journal_stop(handle);
+                page_cache_release(page);
+        }
+        /* handle and page were released above, so it is safe to start over */
+        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+                goto retry;
+out:
+        return ret;
+}
+/*
+ * Decide whether i_disksize should be updated for a write that reaches
+ * the end of the file but does not require block allocation.
+ *
+ * @offset is a byte offset inside @page.  Returns 1 when the buffer
+ * covering @offset is mapped and not delayed, 0 otherwise.
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+                                         unsigned long offset)
+{
+        struct inode *inode = page->mapping->host;
+        struct buffer_head *bh = page_buffers(page);
+        unsigned int steps = offset >> inode->i_blkbits;
+        /* step along the page's buffer list to the one holding @offset */
+        while (steps--)
+                bh = bh->b_this_page;
+        return buffer_mapped(bh) && !buffer_delay(bh);
+}
+/*
+ * Finish a buffered write on a delalloc mapping.
+ *
+ * Uses the journal handle that is current for this task (obtained with
+ * ext4_journal_current_handle()) and stops it before returning.  If the
+ * write extends past i_disksize and the last buffer written is already
+ * mapped and not delayed, i_disksize is advanced under i_data_sem.
+ *
+ * Returns the value of generic_write_end() (bytes copied) on success,
+ * otherwise the first error seen from ext4_jbd2_file_inode(),
+ * generic_write_end() or ext4_journal_stop().
+ */
+static int ext4_da_write_end(struct file *file,
+                                struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata)
+{
+        struct inode *inode = mapping->host;
+        int ret = 0, ret2;
+        handle_t *handle = ext4_journal_current_handle();
+        loff_t new_i_size;
+        unsigned long start, end;
+        /* in-page offsets of the first and the last byte copied */
+        start = pos & (PAGE_CACHE_SIZE - 1);
+        end = start + copied -1;
+        /*
+         * generic_write_end() will run mark_inode_dirty() if i_size
+         * changes.  So let's piggyback the i_disksize mark_inode_dirty
+         * into that.
+         */
+        new_i_size = pos + copied;
+        if (new_i_size > EXT4_I(inode)->i_disksize) {
+                if (ext4_da_should_update_i_disksize(page, end)) {
+                        down_write(&EXT4_I(inode)->i_data_sem);
+                        /* re-check now that i_data_sem is held */
+                        if (new_i_size > EXT4_I(inode)->i_disksize) {
+                                /*
+                                 * Updating i_disksize when extending file
+                                 * without needing block allocation
+                                 */
+                                if (ext4_should_order_data(inode))
+                                        ret = ext4_jbd2_file_inode(handle,
+                                                                   inode);
+                                EXT4_I(inode)->i_disksize = new_i_size;
+                        }
+                        up_write(&EXT4_I(inode)->i_data_sem);
+                }
+        }
+        ret2 = generic_write_end(file, mapping, pos, len, copied,
+                                                        page, fsdata);
+        copied = ret2;
+        if (ret2 < 0)
+                ret = ret2;
+        /* the handle is stopped even if an error was recorded above */
+        ret2 = ext4_journal_stop(handle);
+        if (!ret)
+                ret = ret2;
+        return ret ? ret : copied;
+}
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+        BUG_ON(!PageLocked(page));
+        /*
+         * Drop reserved blocks first; this is only done for a page that
+         * has buffers attached.
+         */
+        if (page_has_buffers(page))
+                ext4_da_page_release_reservation(page, offset);
+        /* then always hand over to the regular invalidate path */
+        ext4_invalidatepage(page, offset);
+}
 /*
 * bmap() is special.  It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
        journal_t *journal;
        int err;
+        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+                        test_opt(inode->i_sb, DELALLOC)) {
+                /*
+                 * With delalloc we want to sync the file
+                 * so that we can make sure we allocate
+                 * blocks for file
+                 */
+                filemap_write_and_wait(mapping);
+        }
        if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
        return 0;
 }
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-        if (buffer_mapped(bh))
-                return ext4_journal_dirty_data(handle, bh);
-        return 0;
-}
 /*
- * Note that we always start a transaction even if we're not journalling
+ * Note that we don't need to start a transaction unless we're journaling data
- * data.  This is to preserve ordering: any hole instantiation within
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * __block_write_full_page -> ext4_get_block() should be journalled
+ * need to file the inode to the transaction's list in ordered mode because if
- * along with the data so we don't crash and then get metadata which
+ * we are writing back data added by write(), the inode is already there and if
- * refers to old data.
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
 *
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
 */
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
                                struct writeback_control *wbc)
 {
        struct inode *inode = page->mapping->host;
-        struct buffer_head *page_bufs;
-        handle_t *handle = NULL;
-        int ret = 0;
-        int err;
-        J_ASSERT(PageLocked(page));
-        /*
-         * We give up here if we're reentered, because it might be for a
-         * different filesystem.
-         */
-        if (ext4_journal_current_handle())
-                goto out_fail;
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (test_opt(inode->i_sb, NOBH))
+                return nobh_writepage(page,
+                                        ext4_normal_get_block_write, wbc);
+        else
+                return block_write_full_page(page,
+                                                ext4_normal_get_block_write,
+                                                wbc);
+}
-        if (IS_ERR(handle)) {
+static int ext4_normal_writepage(struct page *page,
-                ret = PTR_ERR(handle);
+                                struct writeback_control *wbc)
-                goto out_fail;
+{
-        }
+        struct inode *inode = page->mapping->host;
+        loff_t size = i_size_read(inode);
+        loff_t len;
-        if (!page_has_buffers(page)) {
+        J_ASSERT(PageLocked(page));
-                create_empty_buffers(page, inode->i_sb->s_blocksize,
+        if (page->index == size >> PAGE_CACHE_SHIFT)
-                                (1 << BH_Dirty)|(1 << BH_Uptodate));
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                /* if page has buffers it should all be mapped
+                 * and allocated. If there are not buffers attached
+                 * to the page we know the page is dirty but it lost
+                 * buffers. That means that at some moment in time
+                 * after write_begin() / write_end() has been called
+                 * all buffers have been clean and thus they must have been
+                 * written at least once. So they are all mapped and we can
+                 * happily proceed with mapping them and writing the page.
+                 */
+                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                        ext4_bh_unmapped_or_delay));
        }
-        page_bufs = page_buffers(page);
-        walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, bget_one);
-        ret = block_write_full_page(page, ext4_get_block, wbc);
-        /*
+        if (!ext4_journal_current_handle())
-         * The page can become unlocked at any point now, and
+                return __ext4_normal_writepage(page, wbc);
-         * truncate can then come in and change things.  So we
-         * can't touch *page from now on.  But *page_bufs is
-         * safe due to elevated refcount.
-         */
-        /*
-         * And attach them to the current transaction.  But only if
-         * block_write_full_page() succeeded.  Otherwise they are unmapped,
-         * and generally junk.
-         */
-        if (ret == 0) {
-                err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
-                                        NULL, jbd2_journal_dirty_data_fn);
-                if (!ret)
-                        ret = err;
-        }
-        walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, bput_one);
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-        return ret;
-out_fail:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
-        return ret;
+        return 0;
 }
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
                                struct writeback_control *wbc)
 {
-        struct inode *inode = page->mapping->host;
+        struct address_space *mapping = page->mapping;
+        struct inode *inode = mapping->host;
+        struct buffer_head *page_bufs;
        handle_t *handle = NULL;
        int ret = 0;
        int err;
-        if (ext4_journal_current_handle())
+        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                goto out_fail;
+                                        ext4_normal_get_block_write);
+        if (ret != 0)
+                goto out_unlock;
+        page_bufs = page_buffers(page);
+        walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+                                                                bget_one);
+        /* As soon as we unlock the page, it can go away, but we have
+         * references to buffers so we are safe */
+        unlock_page(page);
        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
-                goto out_fail;
+                goto out;
        }
-        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+        ret = walk_page_buffers(handle, page_bufs, 0,
-                ret = nobh_writepage(page, ext4_get_block, wbc);
+                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-        else
-                ret = block_write_full_page(page, ext4_get_block, wbc);
+        err = walk_page_buffers(handle, page_bufs, 0,
+                                PAGE_CACHE_SIZE, NULL, write_end_fn);
+        if (ret == 0)
+                ret = err;
        err = ext4_journal_stop(handle);
        if (!ret)
                ret = err;
-        return ret;
-out_fail:
+        walk_page_buffers(handle, page_bufs, 0,
-        redirty_page_for_writepage(wbc, page);
+                                PAGE_CACHE_SIZE, NULL, bput_one);
+        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+        goto out;
+out_unlock:
        unlock_page(page);
+out:
        return ret;
 }
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
                                struct writeback_control *wbc)
 {
        struct inode *inode = page->mapping->host;
-        handle_t *handle = NULL;
+        loff_t size = i_size_read(inode);
-        int ret = 0;
+        loff_t len;
-        int err;
-        if (ext4_journal_current_handle())
+        J_ASSERT(PageLocked(page));
-                goto no_write;
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                /* if page has buffers it should all be mapped
+                 * and allocated. If there are not buffers attached
+                 * to the page we know the page is dirty but it lost
+                 * buffers. That means that at some moment in time
+                 * after write_begin() / write_end() has been called
+                 * all buffers have been clean and thus they must have been
+                 * written at least once. So they are all mapped and we can
+                 * happily proceed with mapping them and writing the page.
+                 */
+                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                        ext4_bh_unmapped_or_delay));
+        }
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (ext4_journal_current_handle())
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
                goto no_write;
-        }
-        if (!page_has_buffers(page) || PageChecked(page)) {
+        if (PageChecked(page)) {
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                return __ext4_journalled_writepage(page, wbc);
-                                        ext4_get_block);
-                if (ret != 0) {
-                        ext4_journal_stop(handle);
-                        goto out_unlock;
-                }
-                ret = walk_page_buffers(handle, page_buffers(page), 0,
-                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-                err = walk_page_buffers(handle, page_buffers(page), 0,
-                                PAGE_CACHE_SIZE, NULL, write_end_fn);
-                if (ret == 0)
-                        ret = err;
-                EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-                unlock_page(page);
        } else {
                /*
                 * It may be a page full of checkpoint-mode buffers.  We don't
                 * really know unless we go poke around in the buffer_heads.
                 * But block_write_full_page will do the right thing.
                 */
-                ret = block_write_full_page(page, ext4_get_block, wbc);
+                return block_write_full_page(page,
+                                                ext4_normal_get_block_write,
+                                                wbc);
        }
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-out:
-        return ret;
 no_write:
        redirty_page_for_writepage(wbc, page);
-out_unlock:
        unlock_page(page);
-        goto out;
+        return 0;
 }
 static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
        .readpage       = ext4_readpage,
        .readpages      = ext4_readpages,
-        .writepage      = ext4_ordered_writepage,
+        .writepage      = ext4_normal_writepage,
        .sync_page      = block_sync_page,
        .write_begin    = ext4_write_begin,
        .write_end      = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
 static const struct address_space_operations ext4_writeback_aops = {
        .readpage       = ext4_readpage,
        .readpages      = ext4_readpages,
-        .writepage      = ext4_writeback_writepage,
+        .writepage      = ext4_normal_writepage,
        .sync_page      = block_sync_page,
        .write_begin    = ext4_write_begin,
        .write_end      = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
        .releasepage    = ext4_releasepage,
 };
+/*
+ * Address space operations for delayed allocation.  ext4_set_aops()
+ * installs this table when the DELALLOC mount option is set and the inode
+ * is in ordered or writeback data mode.  The write and writeback entry
+ * points are the ext4_da_* variants.
+ */
+static const struct address_space_operations ext4_da_aops = {
+        .readpage       = ext4_readpage,
+        .readpages      = ext4_readpages,
+        .writepage      = ext4_da_writepage,
+        .writepages     = ext4_da_writepages,
+        .sync_page      = block_sync_page,
+        .write_begin    = ext4_da_write_begin,
+        .write_end      = ext4_da_write_end,
+        .bmap           = ext4_bmap,
+        .invalidatepage = ext4_da_invalidatepage,
+        .releasepage    = ext4_releasepage,
+        .direct_IO      = ext4_direct_IO,
+        .migratepage    = buffer_migrate_page,
+};
 void ext4_set_aops(struct inode *inode)
 {
-        if (ext4_should_order_data(inode))
+        if (ext4_should_order_data(inode) &&
+                test_opt(inode->i_sb, DELALLOC))
+                inode->i_mapping->a_ops = &ext4_da_aops;
+        else if (ext4_should_order_data(inode))
                inode->i_mapping->a_ops = &ext4_ordered_aops;
+        else if (ext4_should_writeback_data(inode) &&
+                 test_opt(inode->i_sb, DELALLOC))
+                inode->i_mapping->a_ops = &ext4_da_aops;
        else if (ext4_should_writeback_data(inode))
                inode->i_mapping->a_ops = &ext4_writeback_aops;
        else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
 * This required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 */
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
+        struct page *page;
        int err = 0;
+        page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+        if (!page)
+                return -EINVAL;
        blocksize = inode->i_sb->s_blocksize;
        length = blocksize - (offset & (blocksize - 1));
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
                err = ext4_journal_dirty_metadata(handle, bh);
        } else {
                if (ext4_should_order_data(inode))
-                        err = ext4_journal_dirty_data(handle, bh);
+                        err = ext4_jbd2_file_inode(handle, inode);
                mark_buffer_dirty(bh);
        }
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
        if (this_bh) {
                BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
-                ext4_journal_dirty_metadata(handle, this_bh);
+                /*
+                 * The buffer head should have an attached journal head at this
+                 * point. However, if the data is corrupted and an indirect
+                 * block pointed to itself, it would have been detached when
+                 * the block was cleared. Check for this instead of OOPSing.
+                 */
+                if (bh2jh(this_bh))
+                        ext4_journal_dirty_metadata(handle, this_bh);
+                else
+                        ext4_error(inode->i_sb, __func__,
+                                   "circular indirect block detected, "
+                                   "inode=%lu, block=%llu",
+                                   inode->i_ino,
+                                   (unsigned long long) this_bh->b_blocknr);
        }
 }
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
        }
 }
+/*
+ * Tell whether @inode may be truncated: returns 1 if so, 0 otherwise.
+ * Append-only and immutable inodes are never truncated.  Of the remaining
+ * ones only regular files, directories and symlinks that are not fast
+ * symlinks qualify.
+ */
+int ext4_can_truncate(struct inode *inode)
+{
+        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                return 0;
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
+        case S_IFDIR:
+                return 1;
+        case S_IFLNK:
+                return !ext4_inode_is_fast_symlink(inode);
+        default:
+                return 0;
+        }
+}
 /*
 * ext4_truncate()
 *
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
        int n;
        ext4_lblk_t last_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
-        struct page *page;
-        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+        if (!ext4_can_truncate(inode))
-            S_ISLNK(inode->i_mode)))
-                return;
-        if (ext4_inode_is_fast_symlink(inode))
-                return;
-        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
-        /*
-         * We have to lock the EOF page here, because lock_page() nests
-         * outside jbd2_journal_start().
-         */
-        if ((inode->i_size & (blocksize - 1)) == 0) {
-                /* Block boundary? Nothing to do */
-                page = NULL;
-        } else {
-                page = grab_cache_page(mapping,
-                                inode->i_size >> PAGE_CACHE_SHIFT);
-                if (!page)
-                        return;
-        }
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-                ext4_ext_truncate(inode, page);
+                ext4_ext_truncate(inode);
                return;
        }
        handle = start_transaction(inode);
-        if (IS_ERR(handle)) {
+        if (IS_ERR(handle))
-                if (page) {
-                        clear_highpage(page);
-                        flush_dcache_page(page);
-                        unlock_page(page);
-                        page_cache_release(page);
-                }
                return;         /* AKPM: return what? */
-        }
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
-        if (page)
+        if (inode->i_size & (blocksize - 1))
-                ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+                if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+                        goto out_stop;
        n = ext4_block_to_path(inode, last_block, offsets, NULL);
        if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
                goto out_stop;
        /*
+         * From here we block out all ext4_get_block() callers who want to
+         * modify the block allocation tree.
+         */
+        down_write(&ei->i_data_sem);
+        /*
         * The orphan list entry will now protect us from any crash which
         * occurs before the truncate completes, so it is now safe to propagate
         * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
         */
        ei->i_disksize = inode->i_size;
-        /*
-         * From here we block out all ext4_get_block() callers who want to
-         * modify the block allocation tree.
-         */
-        down_write(&ei->i_data_sem);
        if (n == 1) {           /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
- * Called with inode->sem down.
+ * Another thing we have to ensure is that if we are in ordered mode
+ * and the inode is still attached to the committing transaction, we
+ * must start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction is already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
 */
 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                if (!error)
                        error = rc;
                ext4_journal_stop(handle);
+                if (ext4_should_order_data(inode)) {
+                        error = ext4_begin_ordered_truncate(inode,
+                                                            attr->ia_size);
+                        if (error) {
+                                /* Do as much error cleanup as possible */
+                                handle = ext4_journal_start(inode, 3);
+                                if (IS_ERR(handle)) {
+                                        ext4_orphan_del(NULL, inode);
+                                        goto err_out;
+                                }
+                                ext4_orphan_del(handle, inode);
+                                ext4_journal_stop(handle);
+                                goto err_out;
+                        }
+                }
        }
        rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
        return error;
 }
+/*
+ * Fill in *stat for the inode behind dentry.
+ *
+ * generic_fillattr() supplies the regular attributes; the number of
+ * blocks currently reserved for delayed allocation is then added to
+ * stat->blocks, expressed in 512-byte units.  The mnt argument is not
+ * used.  Always returns 0.
+ */
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                 struct kstat *stat)
+{
+        struct inode *inode;
+        /*
+         * Kept as a 64-bit quantity so that the conversion to bytes
+         * below cannot wrap on builds where unsigned long is 32 bits.
+         */
+        unsigned long long delalloc_blocks;
+        inode = dentry->d_inode;
+        generic_fillattr(inode, stat);
+        /*
+         * We can't update i_blocks if the block allocation is delayed,
+         * otherwise in the case of a system crash before the real block
+         * allocation is done, we will have i_blocks inconsistent with
+         * the on-disk file blocks.
+         * We always keep i_blocks updated together with the real
+         * allocation. But to not confuse the user, stat
+         * will return the blocks that include the delayed allocation
+         * blocks for this file.
+         */
+        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        /* filesystem blocks -> bytes -> 512-byte sectors */
+        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+        return 0;
+}
 /*
 * How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        return err;
 }
+/*
+ * Buffer callback handed to walk_page_buffers(): yields 1 when bh has
+ * no disk mapping and 0 when it is mapped.  The handle is not used.
+ */
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+        if (buffer_mapped(bh))
+                return 0;
+        return 1;
+}
+/*
+ * Make sure a page that is about to be written through a mapping has
+ * blocks allocated (or reserved) behind it.
+ *
+ * NOTE(review): presumably installed as the ->page_mkwrite handler of
+ * the file's vm_ops; the registration is not visible here.
+ *
+ * Returns -EINVAL if the page no longer belongs to the file's mapping,
+ * starts at or beyond i_size, or is not uptodate; 0 if nothing has to
+ * be done or the allocation succeeded; otherwise the negative error
+ * returned by write_begin()/write_end().
+ */
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+        loff_t size;
+        unsigned long len;
+        int ret = -EINVAL;
+        struct file *file = vma->vm_file;
+        struct inode *inode = file->f_path.dentry->d_inode;
+        struct address_space *mapping = inode->i_mapping;
+        /*
+         * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+         * get i_mutex because we are already holding mmap_sem.
+         */
+        down_read(&inode->i_alloc_sem);
+        size = i_size_read(inode);
+        if (page->mapping != mapping || size <= page_offset(page)
+            || !PageUptodate(page)) {
+                /* page got truncated from under us? */
+                goto out_unlock;
+        }
+        ret = 0;
+        if (PageMappedToDisk(page))
+                goto out_unlock;
+        /* On the page that contains i_size only the bytes below i_size count */
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        if (page_has_buffers(page)) {
+                /* return if we have all the buffers mapped */
+                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                       ext4_bh_unmapped))
+                        goto out_unlock;
+        }
+        /*
+         * OK, we need to fill the hole... Do write_begin/write_end
+         * to do block allocation/reservation. We are not holding
+         * inode->i_mutex here, which allows parallel write_begin/
+         * write_end calls. lock_page prevents this from happening
+         * on the same page though.
+         */
+        ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+                        len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+        if (ret < 0)
+                goto out_unlock;
+        ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+                        len, len, page, NULL);
+        if (ret < 0)
+                goto out_unlock;
+        /* Only success/failure is reported; drop any non-negative result */
+        ret = 0;
+out_unlock:
+        up_read(&inode->i_alloc_sem);
+        return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
-        int fix = 0;
+        int fix = 0, ret, tmpmax;
         addr = mb_correct_addr_and_bit(&fix, addr);
-        max += fix;
+        tmpmax = max + fix;
         start += fix;
-        return ext4_find_next_zero_bit(addr, max, start) - fix;
+        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+        /*
+         * The search ran on the fix-adjusted range; never hand the caller
+         * a position beyond the max it asked for.
+         */
+        if (ret > max)
+                return max;
+        return ret;
 }
 static inline int mb_find_next_bit(void *addr, int max, int start)
 {
-        int fix = 0;
+        int fix = 0, ret, tmpmax;
         addr = mb_correct_addr_and_bit(&fix, addr);
-        max += fix;
+        tmpmax = max + fix;
         start += fix;
-        return ext4_find_next_bit(addr, max, start) - fix;
+        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+        /*
+         * The search ran on the fix-adjusted range; never hand the caller
+         * a position beyond the max it asked for.
+         */
+        if (ret > max)
+                return max;
+        return ret;
 }
 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                if (!buffer_uptodate(bh[i]))
                        goto out;
+        err = 0;
        first_block = page->index * blocks_per_page;
        for (i = 0; i < blocks_per_page; i++) {
                int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        int pnum;
        int poff;
        struct page *page;
+        int ret;
        mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                if (page) {
                        BUG_ON(page->mapping != inode->i_mapping);
                        if (!PageUptodate(page)) {
-                                ext4_mb_init_cache(page, NULL);
+                                ret = ext4_mb_init_cache(page, NULL);
+                                if (ret) {
+                                        unlock_page(page);
+                                        goto err;
+                                }
                                mb_cmp_bitmaps(e4b, page_address(page) +
                                               (poff * sb->s_blocksize));
                        }
                        unlock_page(page);
                }
        }
-        if (page == NULL || !PageUptodate(page))
+        if (page == NULL || !PageUptodate(page)) {
+                ret = -EIO;
                goto err;
+        }
        e4b->bd_bitmap_page = page;
        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
        mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
                if (page) {
                        BUG_ON(page->mapping != inode->i_mapping);
-                        if (!PageUptodate(page))
+                        if (!PageUptodate(page)) {
-                                ext4_mb_init_cache(page, e4b->bd_bitmap);
+                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+                                if (ret) {
+                                        unlock_page(page);
+                                        goto err;
+                                }
+                        }
                        unlock_page(page);
                }
        }
-        if (page == NULL || !PageUptodate(page))
+        if (page == NULL || !PageUptodate(page)) {
+                ret = -EIO;
                goto err;
+        }
        e4b->bd_buddy_page = page;
        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
        mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
                page_cache_release(e4b->bd_buddy_page);
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
-        return -EIO;
+        return ret;
 }
 static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
        }
 }
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                          int first, int count)
 {
        int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr += block;
                        blocknr +=
                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        ext4_unlock_group(sb, e4b->bd_group);
                        ext4_error(sb, __func__, "double-free of inode"
                                   " %lu's block %llu(bit %u in group %lu)\n",
                                   inode ? inode->i_ino : 0, blocknr, block,
                                   e4b->bd_group);
+                        ext4_lock_group(sb, e4b->bd_group);
                }
                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
                e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                } while (1);
        }
        mb_check_buddy(e4b);
-        return 0;
 }
 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
                spin_unlock(&sbi->s_md_lock);
        }
-        /* searching for the right group start from the goal value specified */
-        group = ac->ac_g_ex.fe_group;
        /* Let's just scan groups to find more-less suitable blocks */
        cr = ac->ac_2order ? 0 : 1;
        /*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 repeat:
        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
                ac->ac_criteria = cr;
+                /*
+                 * searching for the right group start
+                 * from the goal value specified
+                 */
+                group = ac->ac_g_ex.fe_group;
                for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
                        struct ext4_group_info *grp;
                        struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
        int rc;
        int size;
+        if (unlikely(sbi->s_mb_history == NULL))
+                return -ENOMEM;
        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (s == NULL)
                return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
        sbi->s_mb_history_cur = 0;
        spin_lock_init(&sbi->s_mb_history_lock);
        i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-        sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
+        sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
-        if (likely(sbi->s_mb_history != NULL))
-                memset(sbi->s_mb_history, 0, i);
        /* if we can't allocate history, then we simple won't use it */
 }
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 #define ext4_mb_history_init(sb)
 #endif
+/*
+ * Create and initialize ext4_group_info data for the given group.
+ *
+ * sb:    super block the group belongs to
+ * group: number of the block group to set up
+ * desc:  group descriptor of that group; seeds the initial bb_free
+ *
+ * When group is the first one covered by a descriptor block, a new
+ * table of ext4_group_info pointers is allocated and stored in
+ * sbi->s_group_info before the per-group structure is created.
+ *
+ * Returns 0 on success and -ENOMEM if either allocation fails.  On
+ * failure a table allocated by this call is freed again and its slot
+ * in sbi->s_group_info is reset to NULL.
+ */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+                          struct ext4_group_desc *desc)
+{
+        int i, len;
+        int metalen = 0;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info **meta_group_info;
+        /*
+         * First check if this group is the first of a reserved block.
+         * If it's true, we have to allocate a new table of pointers
+         * to ext4_group_info structures
+         */
+        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+                metalen = sizeof(*meta_group_info) <<
+                        EXT4_DESC_PER_BLOCK_BITS(sb);
+                meta_group_info = kmalloc(metalen, GFP_KERNEL);
+                if (meta_group_info == NULL) {
+                        printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+                               "buddy group\n");
+                        goto exit_meta_group_info;
+                }
+                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+                        meta_group_info;
+        }
+        /*
+         * calculate needed size. if change bb_counters size,
+         * don't forget about ext4_mb_generate_buddy()
+         */
+        len = offsetof(typeof(**meta_group_info),
+                       bb_counters[sb->s_blocksize_bits + 2]);
+        meta_group_info =
+                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+        meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+        if (meta_group_info[i] == NULL) {
+                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+                goto exit_group_info;
+        }
+        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+                &(meta_group_info[i]->bb_state));
+        /*
+         * initialize bb_free to be able to skip
+         * empty groups without initialization
+         */
+        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                meta_group_info[i]->bb_free =
+                        ext4_free_blocks_after_init(sb, group, desc);
+        } else {
+                meta_group_info[i]->bb_free =
+                        le16_to_cpu(desc->bg_free_blocks_count);
+        }
+        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+#ifdef DOUBLE_CHECK
+        {
+                struct buffer_head *bh;
+                meta_group_info[i]->bb_bitmap =
+                        kmalloc(sb->s_blocksize, GFP_KERNEL);
+                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+                bh = ext4_read_block_bitmap(sb, group);
+                BUG_ON(bh == NULL);
+                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+                        sb->s_blocksize);
+                put_bh(bh);
+        }
+#endif
+        return 0;
+exit_group_info:
+        /*
+         * If a meta_group_info table has been allocated, release it now.
+         * Reset the slot too: leaving the freed pointer published in
+         * s_group_info would let a later lookup or cleanup pass use or
+         * free it a second time.
+         */
+        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+        }
+exit_meta_group_info:
+        return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize.
+ *
+ * Creates the in-core group info for the group and then invalidates
+ * the cached pages holding its two buddy-cache blocks (2 * group and
+ * 2 * group + 1).  Returns 0 on success or the error reported by
+ * ext4_mb_add_groupinfo().
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *desc)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct inode *buddy_inode = sbi->s_buddy_cache;
+        struct page *cached;
+        int per_page;
+        int blk;
+        int pnum;
+        int n;
+        int rc;
+        /* Add group based on group descriptor */
+        rc = ext4_mb_add_groupinfo(sb, group, desc);
+        if (rc)
+                return rc;
+        /*
+         * Cache pages containing dynamic mb_alloc data (buddy and bitmap
+         * data) are set not up to date so that they will be re-initialized
+         * during the next call to ext4_mb_load_buddy.  The group owns two
+         * consecutive blocks in the cache; handle both the same way and
+         * skip any page that is not currently cached.
+         */
+        per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        blk = group * 2;
+        for (n = 0; n < 2; n++, blk++) {
+                pnum = blk / per_page;
+                cached = find_get_page(buddy_inode->i_mapping, pnum);
+                if (cached == NULL)
+                        continue;
+                ClearPageUptodate(cached);
+                page_cache_release(cached);
+        }
+        return 0;
+}
+/*
+ * Update an existing group.
+ * This function is used for online resize.
+ *
+ * Adjusts the free block count kept in grp by add.
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+        grp->bb_free = grp->bb_free + add;
+}
 static int ext4_mb_init_backend(struct super_block *sb)
 {
        ext4_group_t i;
-        int j, len, metalen;
+        int metalen;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        int num_meta_group_infos =
+        struct ext4_super_block *es = sbi->s_es;
-                (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+        int num_meta_group_infos;
-                        EXT4_DESC_PER_BLOCK_BITS(sb);
+        int num_meta_group_infos_max;
+        int array_size;
        struct ext4_group_info **meta_group_info;
+        struct ext4_group_desc *desc;
+        /* This is the number of blocks used by GDT */
+        num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+                                1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+        /*
+         * This is the total number of blocks used by GDT including
+         * the number of reserved blocks for GDT.
+         * The s_group_info array is allocated with this value
+         * to allow a clean online resize without a complex
+         * manipulation of pointer.
+         * The drawback is the unused memory when no resize
+         * occurs but it's very low in terms of pages
+         * (see comments below)
+         * Need to handle this properly when META_BG resizing is allowed
+         */
+        num_meta_group_infos_max = num_meta_group_infos +
+                                le16_to_cpu(es->s_reserved_gdt_blocks);
+        /*
+         * array_size is the size of s_group_info array. We round it
+         * to the next power of two because this approximation is done
+         * internally by kmalloc so we can have some more memory
+         * for free here (e.g. may be used for META_BG resize).
+         */
+        array_size = 1;
+        while (array_size < sizeof(*sbi->s_group_info) *
+               num_meta_group_infos_max)
+                array_size = array_size << 1;
        /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
         * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
         * So a two level scheme suffices for now. */
-        sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
+        sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
-                                    num_meta_group_infos, GFP_KERNEL);
        if (sbi->s_group_info == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
                return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
                sbi->s_group_info[i] = meta_group_info;
        }
-        /*
-         * calculate needed size. if change bb_counters size,
-         * don't forget about ext4_mb_generate_buddy()
-         */
-        len = sizeof(struct ext4_group_info);
-        len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
        for (i = 0; i < sbi->s_groups_count; i++) {
-                struct ext4_group_desc *desc;
-                meta_group_info =
-                        sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
-                j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-                meta_group_info[j] = kzalloc(len, GFP_KERNEL);
-                if (meta_group_info[j] == NULL) {
-                        printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
-                        goto err_freebuddy;
-                }
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        printk(KERN_ERR
                                "EXT4-fs: can't read descriptor %lu\n", i);
-                        i++;
                        goto err_freebuddy;
                }
-                memset(meta_group_info[j], 0, len);
+                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
-                set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+                        goto err_freebuddy;
-                        &(meta_group_info[j]->bb_state));
-                /*
-                 * initialize bb_free to be able to skip
-                 * empty groups without initialization
-                 */
-                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                        meta_group_info[j]->bb_free =
-                                ext4_free_blocks_after_init(sb, i, desc);
-                } else {
-                        meta_group_info[j]->bb_free =
-                                le16_to_cpu(desc->bg_free_blocks_count);
-                }
-                INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-#ifdef DOUBLE_CHECK
-                {
-                        struct buffer_head *bh;
-                        meta_group_info[j]->bb_bitmap =
-                                kmalloc(sb->s_blocksize, GFP_KERNEL);
-                        BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
-                        bh = read_block_bitmap(sb, i);
-                        BUG_ON(bh == NULL);
-                        memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
-                                        sb->s_blocksize);
-                        put_bh(bh);
-                }
-#endif
        }
        return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        unsigned i;
        unsigned offset;
        unsigned max;
+        int ret;
        if (!test_opt(sb, MBALLOC))
                return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        } while (i <= sb->s_blocksize_bits + 1);
        /* init file for buddy data */
-        i = ext4_mb_init_backend(sb);
+        ret = ext4_mb_init_backend(sb);
-        if (i) {
+        if (ret != 0) {
                clear_opt(sbi->s_mount_opt, MBALLOC);
                kfree(sbi->s_mb_offsets);
                kfree(sbi->s_mb_maxs);
-                return i;
+                return ret;
        }
        spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
                ext4_lock_group(sb, md->group);
                for (i = 0; i < md->num; i++) {
                        mb_debug(" %u", md->blocks[i]);
-                        err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+                        mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
-                        BUG_ON(err != 0);
                }
                mb_debug("\n");
                ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name)                                \
+#define MB_PROC_FOPS(name)                                      \
-static int ext4_mb_read_##name(char *page, char **start,        \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)      \
-                off_t off, int count, int *eof, void *data)     \
 {                                                               \
-        struct ext4_sb_info *sbi = data;                        \
+        struct ext4_sb_info *sbi = m->private;                  \
-        int len;                                                \
+                                                                \
-        *eof = 1;                                               \
+        seq_printf(m, "%ld\n", sbi->s_mb_##name);               \
-        if (off != 0)                                           \
+        return 0;                                               \
-                return 0;                                       \
+}                                                               \
-        len = sprintf(page, "%ld\n", sbi->s_mb_##name);         \
+                                                                \
-        *start = page;                                          \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
-        return len;                                             \
+{                                                               \
-}
+        return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+}                                                               \
-#define MB_PROC_VALUE_WRITE(name)                               \
+                                                                \
-static int ext4_mb_write_##name(struct file *file,              \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file,   \
-                const char __user *buf, unsigned long cnt, void *data)  \
+                const char __user *buf, size_t cnt, loff_t *ppos)       \
 {                                                               \
-        struct ext4_sb_info *sbi = data;                        \
+        struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
        char str[32];                                           \
        long value;                                             \
        if (cnt >= sizeof(str))                                 \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file,		\
                return -ERANGE;                                 \
        sbi->s_mb_##name = value;                               \
        return cnt;                                             \
-}
+}                                                               \
+                                                                \
+static const struct file_operations ext4_mb_##name##_proc_fops = {      \
+        .owner          = THIS_MODULE,                          \
+        .open           = ext4_mb_##name##_proc_open,           \
+        .read           = seq_read,                             \
+        .llseek         = seq_lseek,                            \
+        .release        = single_release,                       \
+        .write          = ext4_mb_##name##_proc_write,          \
+};
-MB_PROC_VALUE_READ(stats);
+MB_PROC_FOPS(stats);
-MB_PROC_VALUE_WRITE(stats);
+MB_PROC_FOPS(max_to_scan);
-MB_PROC_VALUE_READ(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
+MB_PROC_FOPS(order2_reqs);
-MB_PROC_VALUE_READ(min_to_scan);
+MB_PROC_FOPS(stream_request);
-MB_PROC_VALUE_WRITE(min_to_scan);
+MB_PROC_FOPS(group_prealloc);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
 #define MB_PROC_HANDLER(name, var)                                      \
 do {                                                                    \
-        proc = create_proc_entry(name, mode, sbi->s_mb_proc);           \
+        /* uses proc, mode, sbi and the err_out label of the caller */  \
+        proc = proc_create_data(name, mode, sbi->s_mb_proc,             \
+                                &ext4_mb_##var##_proc_fops, sbi);       \
         if (proc == NULL) {                                             \
                 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
                 goto err_out;                                           \
         }                                                               \
-        proc->data = sbi;                                               \
-        proc->read_proc  = ext4_mb_read_##var ;                         \
-        proc->write_proc = ext4_mb_write_##var;                         \
 } while (0)
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
        struct proc_dir_entry *proc;
        char devname[64];
+        if (proc_root_ext4 == NULL) {
+                sbi->s_mb_proc = NULL;
+                return -EINVAL;
+        }
        bdevname(sb->s_bdev, devname);
        sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        err = -EIO;
-        bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
        if (!bitmap_bh)
                goto out_err;
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
        spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+        /*
+         * The free blocks count has already been reduced/reserved
+         * at write_begin() time for delayed allocation, so do not
+         * account for these blocks a second time.
+         */
+        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+                percpu_counter_sub(&sbi->s_freeblocks_counter,
+                                        ac->ac_b_ex.fe_len);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi,
+                                                          ac->ac_b_ex.fe_group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
        if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                if (bit >= end)
                        break;
                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-                if (next > end)
-                        next = end;
                start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
                                le32_to_cpu(sbi->s_es->s_first_data_block);
                mb_debug("    free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        if (list_empty(&grp->bb_prealloc_list))
                return 0;
-        bitmap_bh = read_block_bitmap(sb, group);
+        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (bitmap_bh == NULL) {
                /* error handling here */
                ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
                err = ext4_mb_load_buddy(sb, group, &e4b);
                BUG_ON(err != 0); /* error handling here */
-                bitmap_bh = read_block_bitmap(sb, group);
+                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (bitmap_bh == NULL) {
                        /* error handling here */
                        ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        sbi = EXT4_SB(sb);
        if (!test_opt(sb, MBALLOC)) {
-                block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+                block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
                                            &(ar->len), errp);
                return block;
        }
+        if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+                /*
+                 * With delalloc the blocks were already reserved, so
+                 * the free-space check is only needed without it.
+                 */
+                ar->len = ext4_has_free_blocks(sbi, ar->len);
+        }
+        if (ar->len == 0) {
+                *errp = -ENOSPC;
+                return 0;
+        }
        while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        }
        inquota = ar->len;
+        if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                ar->flags |= EXT4_MB_DELALLOC_RESERVED;
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (!ac) {
+                ar->len = 0;
                *errp = -ENOMEM;
-                return 0;
+                goto out1;
        }
        ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        *errp = ext4_mb_initialize_context(ac, ar);
        if (*errp) {
                ar->len = 0;
-                goto out;
+                goto out2;
        }
        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);
 repeat:
@@ -4085,11 +4261,12 @@ repeat:
        ext4_mb_release_context(ac);
-out:
+out2:
+        kmem_cache_free(ext4_ac_cachep, ac);
+out1:
        if (ar->len < inquota)
                DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
-        kmem_cache_free(ext4_ac_cachep, ac);
        return block;
 }
 static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
                overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
        }
-        bitmap_bh = read_block_bitmap(sb, block_group);
+        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
                ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
        } else {
                ext4_lock_group(sb, block_group);
-                err = mb_free_blocks(inode, &e4b, bit, count);
+                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
                ext4_unlock_group(sb, block_group);
-                BUG_ON(err != 0);
        }
        spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
        spin_unlock(sb_bgl_lock(sbi, block_group));
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+                spin_lock(sb_bgl_lock(sbi, flex_group));
+                sbi->s_flex_groups[flex_group].free_blocks += count;
+                spin_unlock(sb_bgl_lock(sbi, flex_group));
+        }
        ext4_mb_release_desc(&e4b);
        *freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 /*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+        /* rec_len is kept in on-disk form; decode it to get the byte step. */
+        unsigned int step = ext4_rec_len_from_disk(p->rec_len);
+        char *following = (char *)p + step;
+        return (struct ext4_dir_entry_2 *)following;
+}
+/*
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
 {
        unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
                EXT4_DIR_REC_LEN(2) - infosize;
-        return 0? 20: entry_space / sizeof(struct dx_entry);
+        return entry_space / sizeof(struct dx_entry);
 }
+/*
+ * Number of struct dx_entry slots that fit in one block of @dir's
+ * filesystem after setting aside EXT4_DIR_REC_LEN(0) bytes.
+ */
 static inline unsigned dx_node_limit (struct inode *dir)
 {
        unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
-        return 0? 22: entry_space / sizeof(struct dx_entry);
+        return entry_space / sizeof(struct dx_entry);
 }
 /*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 /*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
-        return (struct ext4_dir_entry_2 *)((char *)p +
-                ext4_rec_len_from_disk(p->rec_len));
-}
-/*
 * This function fills a red-black tree with information from a
 * directory block.  It returns the number directory entries loaded
 * into the tree.  If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
                de = (struct ext4_dir_entry_2 *) bh->b_data;
                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
                                       EXT4_DIR_REC_LEN(0));
-                for (; de < top; de = ext4_next_entry(de))
+                for (; de < top; de = ext4_next_entry(de)) {
-                if (ext4_match (namelen, name, de)) {
+                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                        if (!ext4_check_dir_entry("ext4_find_entry",
+                                  + ((char *) de - bh->b_data);
-                                                  dir, de, bh,
-                                  (block<<EXT4_BLOCK_SIZE_BITS(sb))
+                        if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
-                                          +((char *)de - bh->b_data))) {
+                                brelse(bh);
-                                brelse (bh);
                                *err = ERR_BAD_DX_DIR;
                                goto errout;
                        }
-                        *res_dir = de;
-                        dx_release (frames);
+                        if (ext4_match(namelen, name, de)) {
-                        return bh;
+                                *res_dir = de;
+                                dx_release(frames);
+                                return bh;
+                        }
                }
                brelse (bh);
                /* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
        /*
+         * We can allocate memory for mb_alloc based on the new group
+         * descriptor
+         */
+        if (test_opt(sb, MBALLOC)) {
+                err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+                if (err)
+                        goto exit_journal;
+        }
+        /*
         * Make the new blocks and inodes valid next.  We do this before
         * increasing the group count so that once the group is enabled,
         * all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        handle_t *handle;
        int err;
        unsigned long freed_blocks;
+        ext4_group_t group;
+        struct ext4_group_info *grp;
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        }
        /* Handle the remaining blocks in the last group only. */
-        ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+        ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
        if (last == 0) {
                ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                   o_blocks_count + add);
        if ((err = ext4_journal_stop(handle)))
                goto exit_put;
+        /*
+         * Mark mballoc pages as not up to date so that they will be updated
+         * next time they are loaded by ext4_mb_load_buddy.
+         */
+        if (test_opt(sb, MBALLOC)) {
+                struct ext4_sb_info *sbi = EXT4_SB(sb);
+                struct inode *inode = sbi->s_buddy_cache;
+                int blocks_per_page;
+                int block;
+                int pnum;
+                struct page *page;
+                /* Set buddy page as not up to date */
+                blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+                block = group * 2;
+                pnum = block / blocks_per_page;
+                page = find_get_page(inode->i_mapping, pnum);
+                if (page != NULL) {
+                        ClearPageUptodate(page);
+                        page_cache_release(page);
+                }
+                /* Set bitmap page as not up to date */
+                block++;
+                pnum = block / blocks_per_page;
+                page = find_get_page(inode->i_mapping, pnum);
+                if (page != NULL) {
+                        ClearPageUptodate(page);
+                        page_cache_release(page);
+                }
+                /* Get the info on the last group */
+                grp = ext4_get_group_info(sb, group);
+                /* Update free blocks in group info */
+                ext4_mb_update_group_info(grp, add);
+        }
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
                       ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf24343979..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
        jbd2_journal_destroy(sbi->s_journal);
+        sbi->s_journal = NULL;
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
        kfree(sbi->s_group_desc);
+        kfree(sbi->s_flex_groups);
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
+        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+        ei->i_reserved_data_blocks = 0;
+        ei->i_reserved_meta_blocks = 0;
+        ei->i_allocated_meta_blocks = 0;
+        ei->i_delalloc_reserved_flag = 0;
+        spin_lock_init(&(ei->i_block_reservation_lock));
        return &ei->vfs_inode;
 }
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
        EXT4_I(inode)->i_block_alloc_info = NULL;
        if (unlikely(rsv))
                kfree(rsv);
+        jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+                                       &EXT4_I(inode)->jinode);
 }
 static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        unsigned long def_mount_opts;
        struct super_block *sb = vfs->mnt_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        journal_t *journal = sbi->s_journal;
        struct ext4_super_block *es = sbi->s_es;
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",nomballoc");
        if (test_opt(sb, I_VERSION))
                seq_puts(seq, ",i_version");
+        if (!test_opt(sb, DELALLOC))
+                seq_puts(seq, ",nodelalloc");
        if (sbi->s_stripe)
                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-        Opt_mballoc, Opt_nomballoc, Opt_stripe,
+        Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 };
 static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
        {Opt_nomballoc, "nomballoc"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
+        {Opt_delalloc, "delalloc"},
+        {Opt_nodelalloc, "nodelalloc"},
        {Opt_err, NULL},
 };
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
        int qtype, qfmt;
        char *qname;
 #endif
+        ext4_fsblk_t last_block;
        if (!options)
                return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
                        clear_opt(sbi->s_mount_opt, NOBH);
                        break;
                case Opt_extents:
+                        if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                        EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+                                ext4_warning(sb, __func__,
+                                        "extents feature not enabled "
+                                        "on this filesystem, use tune2fs\n");
+                                return 0;
+                        }
                        set_opt (sbi->s_mount_opt, EXTENTS);
                        break;
                case Opt_noextents:
+                        /*
+                         * When e2fsprogs supports resizing an already existing
+                         * ext3 file system to more than 2**32 blocks, we need
+                         * to add support to the block allocator for growing
+                         * already existing block-mapped inodes so that blocks
+                         * allocated for them fall within 2**32
+                         */
+                        last_block = ext4_blocks_count(sbi->s_es) - 1;
+                        if (last_block  > 0xffffffffULL) {
+                                printk(KERN_ERR "EXT4-fs: Filesystem too "
+                                                "large to mount with "
+                                                "-o noextents options\n");
+                                return 0;
+                        }
                        clear_opt (sbi->s_mount_opt, EXTENTS);
                        break;
                case Opt_i_version:
                        set_opt(sbi->s_mount_opt, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
                        break;
+                case Opt_nodelalloc:
+                        clear_opt(sbi->s_mount_opt, DELALLOC);
+                        break;
                case Opt_mballoc:
                        set_opt(sbi->s_mount_opt, MBALLOC);
                        break;
@@ -1331,6 +1370,9 @@ set_qf_format:
                                return 0;
                        sbi->s_stripe = option;
                        break;
+                case Opt_delalloc:
+                        set_opt(sbi->s_mount_opt, DELALLOC);
+                        break;
                default:
                        printk (KERN_ERR
                                "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        return res;
 }
+/*
+ * Build the in-memory per-flex-group summary, sbi->s_flex_groups, by adding
+ * the free inode and free block counts of every block group descriptor to
+ * the entry of the flex group that block group maps to.
+ *
+ * Returns 1 on success, including the case where the superblock has
+ * s_log_groups_per_flex == 0 (nothing is allocated then), and 0 on failure.
+ * On failure sbi->s_flex_groups is left NULL.
+ */
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp;
+        struct buffer_head *bh;
+        ext4_group_t flex_group_count;
+        ext4_group_t flex_group;
+        ext4_group_t i;
+        int groups_per_flex;
+        if (!sbi->s_es->s_log_groups_per_flex) {
+                sbi->s_log_groups_per_flex = 0;
+                return 1;
+        }
+        /*
+         * NOTE(review): s_log_groups_per_flex is read from the superblock
+         * (sbi->s_es) and is not range-checked here; a value at or above
+         * the bit width of int would make the shift below undefined.
+         */
+        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+        groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+        /* Round up so a trailing, partially filled flex group gets a slot. */
+        flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+                groups_per_flex;
+        /* kcalloc zeroes the array and checks count * size for overflow. */
+        sbi->s_flex_groups = kcalloc(flex_group_count,
+                                     sizeof(struct flex_groups), GFP_KERNEL);
+        if (sbi->s_flex_groups == NULL) {
+                printk(KERN_ERR "EXT4-fs: not enough memory\n");
+                goto failed;
+        }
+        for (i = 0; i < sbi->s_groups_count; i++) {
+                gdp = ext4_get_group_desc(sb, i, &bh);
+                if (gdp == NULL) {
+                        /* Do not hand a half-filled table back to the caller. */
+                        kfree(sbi->s_flex_groups);
+                        sbi->s_flex_groups = NULL;
+                        goto failed;
+                }
+                flex_group = ext4_flex_group(sbi, i);
+                sbi->s_flex_groups[flex_group].free_inodes +=
+                        le16_to_cpu(gdp->bg_free_inodes_count);
+                sbi->s_flex_groups[flex_group].free_blocks +=
+                        le16_to_cpu(gdp->bg_free_blocks_count);
+        }
+        return 1;
+failed:
+        return 0;
+}
 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
                            struct ext4_group_desc *gdp)
 {
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 }
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
-                                __releases(kernel_sem)
+                                __releases(kernel_lock)
-                                __acquires(kernel_sem)
+                                __acquires(kernel_lock)
 {
        struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                goto out_fail;
        }
-        if (!sb_set_blocksize(sb, blocksize)) {
-                printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
-                goto out_fail;
-        }
        /*
         * The ext4 superblock will not be buffer aligned for other than 1kB
         * block sizes.  We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        /*
         * turn on extents feature by default in ext4 filesystem
-         * User -o noextents to turn it off
+         * only if feature flag already set by mkfs or tune2fs.
+         * Use -o noextents to turn it off
         */
-        set_opt(sbi->s_mount_opt, EXTENTS);
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+                set_opt(sbi->s_mount_opt, EXTENTS);
+        else
+                ext4_warning(sb, __func__,
+                        "extents feature not enabled on this filesystem, "
+                        "use tune2fs.\n");
        /*
-         * turn on mballoc feature by default in ext4 filesystem
+         * turn on mballoc code by default in ext4 filesystem
-         * User -o nomballoc to turn it off
+         * Use -o nomballoc to turn it off
         */
        set_opt(sbi->s_mount_opt, MBALLOC);
+        /*
+         * enable delayed allocation by default
+         * Use -o nodelalloc to turn it off
+         */
+        set_opt(sbi->s_mount_opt, DELALLOC);
        if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
                            NULL, 0))
                goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
                goto failed_mount2;
        }
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+                if (!ext4_fill_flex_info(sb)) {
+                        printk(KERN_ERR
+                               "EXT4-fs: unable to initialize "
+                               "flex_bg meta info!\n");
+                        goto failed_mount2;
+                }
        sbi->s_gdb_count = db_count;
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
                "writeback");
+        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+                printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+                                "requested data journaling mode\n");
+                clear_opt(sbi->s_mount_opt, DELALLOC);
+        } else if (test_opt(sb, DELALLOC))
+                printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
        ext4_ext_init(sb);
        ext4_mb_init(sb, needs_recovery);
@@ -2372,6 +2485,7 @@ cantfind_ext4:
 failed_mount4:
        jbd2_journal_destroy(sbi->s_journal);
+        sbi->s_journal = NULL;
 failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                        err = ext4_journal_dirty_metadata(handle, bh);
                else {
                        /* Always do at least ordered writes for quotas */
-                        err = ext4_journal_dirty_data(handle, bh);
+                        err = ext4_jbd2_file_inode(handle, inode);
                        mark_buffer_dirty(bh);
                }
                brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
                        /* We need to allocate a new block */
                        ext4_fsblk_t goal = ext4_group_first_block_no(sb,
                                                EXT4_I(inode)->i_block_group);
-                        ext4_fsblk_t block = ext4_new_block(handle, inode,
+                        ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
                                                        goal, &error);
                        if (error)
                                goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
 #include "ext4.h"
 #include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
 static size_t
 ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
                        const char *name, size_t name_len)
 {
-        const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
        if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
 #include "ext4.h"
 #include "xattr.h"
-#define XATTR_USER_PREFIX "user."
 static size_t
 ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
                     const char *name, size_t name_len)
 {
-        const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
        if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
+/*
+ * Allocate one struct fat_cache from fat_cache_cachep; returns whatever
+ * kmem_cache_alloc() returns. @inode is not used by the allocation.
+ * NOTE(review): GFP_NOFS presumably keeps reclaim from re-entering the
+ * filesystem while a caller holds the superblock lock - confirm.
+ */
 static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
 {
-        return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+        return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
 }
 static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
        loff_t cpos;
        int ret = 0;
-        lock_kernel();
+        lock_super(sb);
        cpos = filp->f_pos;
        /* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
        if (unicode)
                __putname(unicode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return ret;
 }
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047e..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
        nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
-        lock_kernel();
        fat_free(inode, nr_clusters);
-        unlock_kernel();
        fat_flush_inodes(inode->i_sb, inode, NULL);
 }
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        int error = 0;
        unsigned int ia_valid;
-        lock_kernel();
        /*
         * Expand the file. Since inode_setattr() updates ->i_size
         * before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        error = inode_setattr(inode, attr);
 out:
-        unlock_kernel();
        return error;
 }
 EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
+/*
+ * Under sbi->inode_hash_lock, call fat_cache_inval_inode() on @inode and
+ * remove it from the i_fat_hash list. The big kernel lock is no longer
+ * taken around this.
+ */
 static void fat_clear_inode(struct inode *inode)
 {
-        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+        struct super_block *sb = inode->i_sb;
+        struct msdos_sb_info *sbi = MSDOS_SB(sb);
-        lock_kernel();
        spin_lock(&sbi->inode_hash_lock);
        fat_cache_inval_inode(inode);
        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
        spin_unlock(&sbi->inode_hash_lock);
-        unlock_kernel();
 }
 static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
 static struct inode *fat_alloc_inode(struct super_block *sb)
 {
        struct msdos_inode_info *ei;
-        ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+        ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
        return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
        if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
                return 0;
-        lock_kernel();
+        lock_super(sb);
        bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
        if (!bh) {
                printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
        if (i_pos != MSDOS_I(inode)->i_pos) {
                spin_unlock(&sbi->inode_hash_lock);
                brelse(bh);
-                unlock_kernel();
+                unlock_super(sb);
                goto retry;
        }
@@ -606,7 +605,7 @@ retry:
                err = sync_dirty_buffer(bh);
        brelse(bh);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
 static struct dentry *fat_get_parent(struct dentry *child)
 {
+        struct super_block *sb = child->d_sb;
        struct buffer_head *bh;
        struct msdos_dir_entry *de;
        loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
        struct inode *inode;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
        if (err) {
                parent = ERR_PTR(err);
                goto out;
        }
-        inode = fat_build_inode(child->d_sb, de, i_pos);
+        inode = fat_build_inode(sb, de, i_pos);
        brelse(bh);
        if (IS_ERR(inode)) {
                parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
                parent = ERR_PTR(-ENOMEM);
        }
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return parent;
 }
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        long error;
        char buf[50];
+        /*
+         * GFP_KERNEL is ok here, because while we do hold the
+         * superblock lock, memory pressure can't call back into
+         * the filesystem, since we're only just about to mount
+         * it and have no inodes etc active!
+         */
        sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
 #include <linux/fdtable.h>
 #include <linux/capability.h>
 #include <linux/dnotify.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
        if (error)
                return error;
-        lock_kernel();
        if ((arg ^ filp->f_flags) & FASYNC) {
                if (filp->f_op && filp->f_op->fasync) {
                        error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
        filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 out:
-        unlock_kernel();
        return error;
 }
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfbb..ab2f57e3fb87 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
          GFS is perfect consistency -- changes made to the filesystem on one
          machine show up immediately on all other machines in the cluster.
-          To use the GFS2 filesystem, you will need to enable one or more of
+          To use the GFS2 filesystem in a cluster, you will need to enable
-          the below locking modules. Documentation and utilities for GFS2 can
+          the locking module below. Documentation and utilities for GFS2 can
          be found here: http://sources.redhat.com/cluster
-config GFS2_FS_LOCKING_NOLOCK
+          The "nolock" lock module is now built in to GFS2 by default.
-        tristate "GFS2 \"nolock\" locking module"
-        depends on GFS2_FS
-        help
-          Single node locking module for GFS2.
-          Use this module if you want to use GFS2 on a single node without
-          its clustering features. You can still take advantage of the
-          large file support, and upgrade to running a full cluster later on
-          if required.
-          If you will only be using GFS2 in cluster mode, you do not need this
-          module.
 config GFS2_FS_LOCKING_DLM
        tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a07..ec65851ec80a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
        ops_fstype.o ops_inode.o ops_super.o quota.o \
        recovery.o rgrp.o super.o sys.o trans.o util.o
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
 obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b56..ef606e3a5cf4 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
 };
 enum {
-        NO_WAIT = 0,
-        WAIT = 1,
-};
-enum {
        NO_FORCE = 0,
        FORCE = 1,
 };
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5d..13391e546616 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
        struct hlist_head hb_list;
 };
-struct glock_iter {
+struct gfs2_glock_iter {
-        int hash;                     /* hash bucket index         */
+        int hash;                       /* hash bucket index         */
-        struct gfs2_sbd *sdp;         /* incore superblock         */
+        struct gfs2_sbd *sdp;           /* incore superblock         */
-        struct gfs2_glock *gl;        /* current glock struct      */
+        struct gfs2_glock *gl;          /* current glock struct      */
-        struct seq_file *seq;         /* sequence file for debugfs */
+        char string[512];               /* scratch space             */
-        char string[512];             /* scratch space             */
 };
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
-static void gfs2_glock_drop_th(struct gfs2_glock *gl);
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
-static void run_queue(struct gfs2_glock *gl);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
 #endif
 /**
- * relaxed_state_ok - is a requested lock compatible with the current lock mode?
- * @actual: the current state of the lock
- * @requested: the lock state that was requested by the caller
- * @flags: the modifier flags passed in by the caller
- *
- * Returns: 1 if the locks are compatible, 0 otherwise
- */
-static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
-                                   int flags)
-{
-        if (actual == requested)
-                return 1;
-        if (flags & GL_EXACT)
-                return 0;
-        if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
-                return 1;
-        if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
-                return 1;
-        return 0;
-}
-/**
 * gl_hash() - Turn glock number into hash bucket number
 * @lock: The glock number
 *
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct inode *aspace = gl->gl_aspace;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+        if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
                sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
        if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 int gfs2_glock_put(struct gfs2_glock *gl)
 {
        int rv = 0;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
        write_lock(gl_lock_addr(gl->gl_hash));
        if (atomic_dec_and_test(&gl->gl_ref)) {
                hlist_del(&gl->gl_list);
                write_unlock(gl_lock_addr(gl->gl_hash));
-                gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+                GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-                gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
-                gfs2_assert(sdp, list_empty(&gl->gl_holders));
+                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
-                gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
-                gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
                glock_free(gl);
                rv = 1;
                goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
        return gl;
 }
+/**
+ * may_grant - check if it's ok to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * Returns: true if it's ok to grant the lock
+ */
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+        const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+        if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+             gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+                return 0;
+        if (gl->gl_state == gh->gh_state)
+                return 1;
+        if (gh->gh_flags & GL_EXACT)
+                return 0;
+        if (gl->gl_state == LM_ST_EXCLUSIVE) {
+                if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+                        return 1;
+                if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+                        return 1;
+        }
+        if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+                return 1;
+        return 0;
+}
+static void gfs2_holder_wake(struct gfs2_holder *gh)
+{
+        clear_bit(HIF_WAIT, &gh->gh_iflags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ * 
+ * Returns: true if there is a blocked holder at the head of the list
+ */
+static int do_promote(struct gfs2_glock *gl)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_holder *gh, *tmp;
+        int ret;
+restart:
+        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        continue;
+                if (may_grant(gl, gh)) {
+                        if (gh->gh_list.prev == &gl->gl_holders &&
+                            glops->go_lock) {
+                                spin_unlock(&gl->gl_spin);
+                                /* FIXME: eliminate this eventually */
+                                ret = glops->go_lock(gh);
+                                spin_lock(&gl->gl_spin);
+                                if (ret) {
+                                        gh->gh_error = ret;
+                                        list_del_init(&gh->gh_list);
+                                        gfs2_holder_wake(gh);
+                                        goto restart;
+                                }
+                                set_bit(HIF_HOLDER, &gh->gh_iflags);
+                                gfs2_holder_wake(gh);
+                                goto restart;
+                        }
+                        set_bit(HIF_HOLDER, &gh->gh_iflags);
+                        gfs2_holder_wake(gh);
+                        continue;
+                }
+                if (gh->gh_list.prev == &gl->gl_holders)
+                        return 1;
+                break;
+        }
+        return 0;
+}
+/**
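+ * do_error - Something unexpected has happened during a lock request
+ * @gl: The glock
+ * @ret: The status; if LM_OUT_ERROR is set every queued request that is
+ *       not yet a holder fails with -EIO, otherwise only LM_FLAG_TRY and
+ *       LM_FLAG_TRY_1CB requests fail (with GLR_TRYFAILED)
+ *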
+ */
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+        struct gfs2_holder *gh, *tmp;
+        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        continue;
+                if (ret & LM_OUT_ERROR)
+                        gh->gh_error = -EIO;
+                else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+                        gh->gh_error = GLR_TRYFAILED;
+                else
+                        continue;
+                list_del_init(&gh->gh_list);
+                gfs2_holder_wake(gh);
+        }
+}
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ */
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+        struct gfs2_holder *gh;
+        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+                if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        return gh;
+        }
+        return NULL;
+}
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ */
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+        int held1, held2;
+        held1 = (gl->gl_state != LM_ST_UNLOCKED);
+        held2 = (new_state != LM_ST_UNLOCKED);
+        if (held1 != held2) {
+                if (held2)
+                        gfs2_glock_hold(gl);
+                else
+                        gfs2_glock_put(gl);
+        }
+        gl->gl_state = new_state;
+        gl->gl_tchange = jiffies;
+}
+static void gfs2_demote_wake(struct gfs2_glock *gl)
+{
+        gl->gl_demote_state = LM_ST_EXCLUSIVE;
+        clear_bit(GLF_DEMOTE, &gl->gl_flags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ */
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_holder *gh;
+        unsigned state = ret & LM_OUT_ST_MASK;
+        spin_lock(&gl->gl_spin);
+        state_change(gl, state);
+        gh = find_first_waiter(gl);
+        /* Demote to UN request arrived during demote to SH or DF */
+        if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+            state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+                gl->gl_target = LM_ST_UNLOCKED;
+        /* Check for state != intended state */
+        if (unlikely(state != gl->gl_target)) {
+                if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+                        /* move to back of queue and try next entry */
+                        if (ret & LM_OUT_CANCELED) {
+                                if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+                                        list_move_tail(&gh->gh_list, &gl->gl_holders);
+                                gh = find_first_waiter(gl);
+                                gl->gl_target = gh->gh_state;
+                                goto retry;
+                        }
+                        /* Some error or failed "try lock" - report it */
+                        if ((ret & LM_OUT_ERROR) ||
+                            (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+                                gl->gl_target = gl->gl_state;
+                                do_error(gl, ret);
+                                goto out;
+                        }
+                }
+                switch(state) {
+                /* Unlocked due to conversion deadlock, try again */
+                case LM_ST_UNLOCKED:
+retry:
+                        do_xmote(gl, gh, gl->gl_target);
+                        break;
+                /* Conversion fails, unlock and try again */
+                case LM_ST_SHARED:
+                case LM_ST_DEFERRED:
+                        do_xmote(gl, gh, LM_ST_UNLOCKED);
+                        break;
+                default: /* Everything else */
+                        printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+                        GLOCK_BUG_ON(gl, 1);
+                }
+                spin_unlock(&gl->gl_spin);
+                gfs2_glock_put(gl);
+                return;
+        }
+        /* Fast path - we got what we asked for */
+        if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+                gfs2_demote_wake(gl);
+        if (state != LM_ST_UNLOCKED) {
+                if (glops->go_xmote_bh) {
+                        int rv;
+                        spin_unlock(&gl->gl_spin);
+                        rv = glops->go_xmote_bh(gl, gh);
+                        if (rv == -EAGAIN)
+                                return;
+                        spin_lock(&gl->gl_spin);
+                        if (rv) {
+                                do_error(gl, rv);
+                                goto out;
+                        }
+                }
+                do_promote(gl);
+        }
+out:
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        spin_unlock(&gl->gl_spin);
+        gfs2_glock_put(gl);
+}
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+                                 unsigned int cur_state, unsigned int req_state,
+                                 unsigned int flags)
+{
+        int ret = LM_OUT_ERROR;
+        if (!sdp->sd_lockstruct.ls_ops->lm_lock)
+                return req_state == LM_ST_UNLOCKED ? 0 : req_state;
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+                                                         req_state, flags);
+        return ret;
+}
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The glock
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ */
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        unsigned int lck_flags = gh ? gh->gh_flags : 0;
+        int ret;
+        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+                      LM_FLAG_PRIORITY);
+        BUG_ON(gl->gl_state == target);
+        BUG_ON(gl->gl_state == gl->gl_target);
+        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+            glops->go_inval) {
+                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+                do_error(gl, 0); /* Fail queued try locks */
+        }
+        spin_unlock(&gl->gl_spin);
+        if (glops->go_xmote_th)
+                glops->go_xmote_th(gl);
+        if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+                glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+        clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+        gfs2_glock_hold(gl);
+        if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
+            gl->gl_state == LM_ST_DEFERRED) &&
+            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+                lck_flags |= LM_FLAG_TRY_1CB;
+        ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+        if (!(ret & LM_OUT_ASYNC)) {
+                finish_xmote(gl, ret);
+                gfs2_glock_hold(gl);
+                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                        gfs2_glock_put(gl);
+        } else {
+                GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
+        }
+        spin_lock(&gl->gl_spin);
+}
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+        struct gfs2_holder *gh;
+        if (!list_empty(&gl->gl_holders)) {
+                gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        return gh;
+        }
+        return NULL;
+}
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ */
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+{
+        struct gfs2_holder *gh = NULL;
+        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+                return;
+        GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+        if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+            gl->gl_demote_state != gl->gl_state) {
+                if (find_first_holder(gl))
+                        goto out;
+                if (nonblock)
+                        goto out_sched;
+                set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+                GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+                gl->gl_target = gl->gl_demote_state;
+        } else {
+                if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+                        gfs2_demote_wake(gl);
+                if (do_promote(gl) == 0)
+                        goto out;
+                gh = find_first_waiter(gl);
+                gl->gl_target = gh->gh_state;
+                if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+                        do_error(gl, 0); /* Fail queued try locks */
+        }
+        do_xmote(gl, gh, gl->gl_target);
+        return;
+out_sched:
+        gfs2_glock_hold(gl);
+        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                gfs2_glock_put(gl);
+out:
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+}
 static void glock_work_func(struct work_struct *work)
 {
+        unsigned long delay = 0;
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+                finish_xmote(gl, gl->gl_reply);
        spin_lock(&gl->gl_spin);
-        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
+        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
-                set_bit(GLF_DEMOTE, &gl->gl_flags);
+            gl->gl_state != LM_ST_UNLOCKED &&
-        run_queue(gl);
+            gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+                unsigned long holdtime, now = jiffies;
+                holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+                if (time_before(now, holdtime))
+                        delay = holdtime - now;
+                set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
+        }
+        run_queue(gl, 0);
        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
+        if (!delay ||
+            queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+                gfs2_glock_put(gl);
 }
 static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
                     void **lockp)
 {
        int error = -EIO;
+        if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
+                return 0;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
                                sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_name = name;
        atomic_set(&gl->gl_ref, 1);
        gl->gl_state = LM_ST_UNLOCKED;
+        gl->gl_target = LM_ST_UNLOCKED;
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
        gl->gl_hash = hash;
-        gl->gl_owner_pid = NULL;
-        gl->gl_ip = 0;
        gl->gl_ops = glops;
-        gl->gl_req_gh = NULL;
        gl->gl_stamp = jiffies;
        gl->gl_tchange = jiffies;
        gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
        gh->gh_ip = 0;
 }
-static void gfs2_holder_wake(struct gfs2_holder *gh)
-{
-        clear_bit(HIF_WAIT, &gh->gh_iflags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-}
 static int just_schedule(void *word)
 {
        schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
        wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
 }
-static void gfs2_demote_wake(struct gfs2_glock *gl)
-{
-        gl->gl_demote_state = LM_ST_EXCLUSIVE;
-        clear_bit(GLF_DEMOTE, &gl->gl_flags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
-}
 static void wait_on_demote(struct gfs2_glock *gl)
 {
        might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
 }
 /**
- * rq_mutex - process a mutex request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_mutex(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        list_del_init(&gh->gh_list);
-        /*  gh->gh_error never examined.  */
-        set_bit(GLF_LOCK, &gl->gl_flags);
-        clear_bit(HIF_WAIT, &gh->gh_iflags);
-        smp_mb();
-        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-        return 1;
-}
-/**
- * rq_promote - process a promote request in the queue
- * @gh: the glock holder
- *
- * Acquire a new inter-node lock, or change a lock state to more restrictive.
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_promote(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-                if (list_empty(&gl->gl_holders)) {
-                        gl->gl_req_gh = gh;
-                        set_bit(GLF_LOCK, &gl->gl_flags);
-                        spin_unlock(&gl->gl_spin);
-                        gfs2_glock_xmote_th(gh->gh_gl, gh);
-                        spin_lock(&gl->gl_spin);
-                }
-                return 1;
-        }
-        if (list_empty(&gl->gl_holders)) {
-                set_bit(HIF_FIRST, &gh->gh_iflags);
-                set_bit(GLF_LOCK, &gl->gl_flags);
-        } else {
-                struct gfs2_holder *next_gh;
-                if (gh->gh_state == LM_ST_EXCLUSIVE)
-                        return 1;
-                next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
-                                     gh_list);
-                if (next_gh->gh_state == LM_ST_EXCLUSIVE)
-                         return 1;
-        }
-        list_move_tail(&gh->gh_list, &gl->gl_holders);
-        gh->gh_error = 0;
-        set_bit(HIF_HOLDER, &gh->gh_iflags);
-        gfs2_holder_wake(gh);
-        return 0;
-}
-/**
- * rq_demote - process a demote request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_demote(struct gfs2_glock *gl)
-{
-        if (!list_empty(&gl->gl_holders))
-                return 1;
-        if (gl->gl_state == gl->gl_demote_state ||
-            gl->gl_state == LM_ST_UNLOCKED) {
-                gfs2_demote_wake(gl);
-                return 0;
-        }
-        set_bit(GLF_LOCK, &gl->gl_flags);
-        set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-        if (gl->gl_demote_state == LM_ST_UNLOCKED ||
-            gl->gl_state != LM_ST_EXCLUSIVE) {
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_drop_th(gl);
-        } else {
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_xmote_th(gl, NULL);
-        }
-        spin_lock(&gl->gl_spin);
-        clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-        return 0;
-}
-/**
- * run_queue - process holder structures on a glock
- * @gl: the glock
- *
- */
-static void run_queue(struct gfs2_glock *gl)
-{
-        struct gfs2_holder *gh;
-        int blocked = 1;
-        for (;;) {
-                if (test_bit(GLF_LOCK, &gl->gl_flags))
-                        break;
-                if (!list_empty(&gl->gl_waiters1)) {
-                        gh = list_entry(gl->gl_waiters1.next,
-                                        struct gfs2_holder, gh_list);
-                        blocked = rq_mutex(gh);
-                } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
-                        blocked = rq_demote(gl);
-                        if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
-                                     !blocked) {
-                                set_bit(GLF_DEMOTE, &gl->gl_flags);
-                                gl->gl_demote_state = LM_ST_UNLOCKED;
-                        }
-                        clear_bit(GLF_WAITERS2, &gl->gl_flags);
-                } else if (!list_empty(&gl->gl_waiters3)) {
-                        gh = list_entry(gl->gl_waiters3.next,
-                                        struct gfs2_holder, gh_list);
-                        blocked = rq_promote(gh);
-                } else
-                        break;
-                if (blocked)
-                        break;
-        }
-}
-/**
- * gfs2_glmutex_lock - acquire a local lock on a glock
- * @gl: the glock
- *
- * Gives caller exclusive access to manipulate a glock structure.
- */
-static void gfs2_glmutex_lock(struct gfs2_glock *gl)
-{
-        spin_lock(&gl->gl_spin);
-        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
-                struct gfs2_holder gh;
-                gfs2_holder_init(gl, 0, 0, &gh);
-                set_bit(HIF_WAIT, &gh.gh_iflags);
-                list_add_tail(&gh.gh_list, &gl->gl_waiters1);
-                spin_unlock(&gl->gl_spin);
-                wait_on_holder(&gh);
-                gfs2_holder_uninit(&gh);
-        } else {
-                gl->gl_owner_pid = get_pid(task_pid(current));
-                gl->gl_ip = (unsigned long)__builtin_return_address(0);
-                spin_unlock(&gl->gl_spin);
-        }
-}
-/**
- * gfs2_glmutex_trylock - try to acquire a local lock on a glock
- * @gl: the glock
- *
- * Returns: 1 if the glock is acquired
- */
-static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
-{
-        int acquired = 1;
-        spin_lock(&gl->gl_spin);
-        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
-                acquired = 0;
-        } else {
-                gl->gl_owner_pid = get_pid(task_pid(current));
-                gl->gl_ip = (unsigned long)__builtin_return_address(0);
-        }
-        spin_unlock(&gl->gl_spin);
-        return acquired;
-}
-/**
- * gfs2_glmutex_unlock - release a local lock on a glock
- * @gl: the glock
- *
- */
-static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
-{
-        struct pid *pid;
-        spin_lock(&gl->gl_spin);
-        clear_bit(GLF_LOCK, &gl->gl_flags);
-        pid = gl->gl_owner_pid;
-        gl->gl_owner_pid = NULL;
-        gl->gl_ip = 0;
-        run_queue(gl);
-        spin_unlock(&gl->gl_spin);
-        put_pid(pid);
-}
-/**
 * handle_callback - process a demote request
 * @gl: the glock
 * @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 {
        int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
-        spin_lock(&gl->gl_spin);
        set_bit(bit, &gl->gl_flags);
        if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
                gl->gl_demote_state = state;
                gl->gl_demote_time = jiffies;
                if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
-                    gl->gl_object) {
+                    gl->gl_object)
                        gfs2_glock_schedule_for_reclaim(gl);
-                        spin_unlock(&gl->gl_spin);
-                        return;
-                }
        } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
                        gl->gl_demote_state != state) {
-                if (test_bit(GLF_DEMOTE_IN_PROGRESS,  &gl->gl_flags)) 
+                gl->gl_demote_state = LM_ST_UNLOCKED;
-                        set_bit(GLF_WAITERS2, &gl->gl_flags);
-                else 
-                        gl->gl_demote_state = LM_ST_UNLOCKED;
-        }
-        spin_unlock(&gl->gl_spin);
-}
-/**
- * state_change - record that the glock is now in a different state
- * @gl: the glock
- * @new_state the new state
- *
- */
-static void state_change(struct gfs2_glock *gl, unsigned int new_state)
-{
-        int held1, held2;
-        held1 = (gl->gl_state != LM_ST_UNLOCKED);
-        held2 = (new_state != LM_ST_UNLOCKED);
-        if (held1 != held2) {
-                if (held2)
-                        gfs2_glock_hold(gl);
-                else
-                        gfs2_glock_put(gl);
        }
-        gl->gl_state = new_state;
-        gl->gl_tchange = jiffies;
 }
 /**
- * drop_bh - Called after a lock module unlock completes
+ * gfs2_glock_wait - wait on a glock acquisition
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_holder *gh = gl->gl_req_gh;
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, !ret);
-        state_change(gl, LM_ST_UNLOCKED);
-        if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
-                spin_lock(&gl->gl_spin);
-                gh->gh_error = 0;
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_xmote_th(gl, gl->gl_req_gh);
-                gfs2_glock_put(gl);
-                return;
-        }
-        spin_lock(&gl->gl_spin);
-        gfs2_demote_wake(gl);
-        clear_bit(GLF_LOCK, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
-}
-/**
- * xmote_bh - Called after the lock module is done acquiring a lock
- * @gl: The glock in question
- * @ret: the int returned from the lock module
- *
- */
-static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        struct gfs2_holder *gh = gl->gl_req_gh;
-        int op_done = 1;
-        if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
-                drop_bh(gl, ret);
-                return;
-        }
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
-        state_change(gl, ret & LM_OUT_ST_MASK);
-        /*  Deal with each possible exit condition  */
-        if (!gh) {
-                gl->gl_stamp = jiffies;
-                if (ret & LM_OUT_CANCELED) {
-                        op_done = 0;
-                } else {
-                        spin_lock(&gl->gl_spin);
-                        if (gl->gl_state != gl->gl_demote_state) {
-                                spin_unlock(&gl->gl_spin);
-                                gfs2_glock_drop_th(gl);
-                                gfs2_glock_put(gl);
-                                return;
-                        }
-                        gfs2_demote_wake(gl);
-                        spin_unlock(&gl->gl_spin);
-                }
-        } else {
-                spin_lock(&gl->gl_spin);
-                if (ret & LM_OUT_CONV_DEADLK) {
-                        gh->gh_error = 0;
-                        set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
-                        spin_unlock(&gl->gl_spin);
-                        gfs2_glock_drop_th(gl);
-                        gfs2_glock_put(gl);
-                        return;
-                }
-                list_del_init(&gh->gh_list);
-                gh->gh_error = -EIO;
-                if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
-                        goto out;
-                gh->gh_error = GLR_CANCELED;
-                if (ret & LM_OUT_CANCELED) 
-                        goto out;
-                if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-                        list_add_tail(&gh->gh_list, &gl->gl_holders);
-                        gh->gh_error = 0;
-                        set_bit(HIF_HOLDER, &gh->gh_iflags);
-                        set_bit(HIF_FIRST, &gh->gh_iflags);
-                        op_done = 0;
-                        goto out;
-                }
-                gh->gh_error = GLR_TRYFAILED;
-                if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
-                        goto out;
-                gh->gh_error = -EINVAL;
-                if (gfs2_assert_withdraw(sdp, 0) == -1)
-                        fs_err(sdp, "ret = 0x%.8X\n", ret);
-out:
-                spin_unlock(&gl->gl_spin);
-        }
-        if (glops->go_xmote_bh)
-                glops->go_xmote_bh(gl);
-        if (op_done) {
-                spin_lock(&gl->gl_spin);
-                gl->gl_req_gh = NULL;
-                clear_bit(GLF_LOCK, &gl->gl_flags);
-                spin_unlock(&gl->gl_spin);
-        }
-        gfs2_glock_put(gl);
-        if (gh)
-                gfs2_holder_wake(gh);
-}
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                 unsigned int cur_state, unsigned int req_state,
-                                 unsigned int flags)
-{
-        int ret = 0;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
-                                                         req_state, flags);
-        return ret;
-}
-/**
- * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
- * @gl: The glock in question
- * @state: the requested state
- * @flags: modifier flags to the lock call
- *
- */
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        int flags = gh ? gh->gh_flags : 0;
-        unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
-                                 LM_FLAG_NOEXP | LM_FLAG_ANY |
-                                 LM_FLAG_PRIORITY);
-        unsigned int lck_ret;
-        if (glops->go_xmote_th)
-                glops->go_xmote_th(gl);
-        if (state == LM_ST_DEFERRED && glops->go_inval)
-                glops->go_inval(gl, DIO_METADATA);
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
-        gfs2_assert_warn(sdp, state != gl->gl_state);
-        gfs2_glock_hold(gl);
-        lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
-        if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
-                return;
-        if (lck_ret & LM_OUT_ASYNC)
-                gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
-        else
-                xmote_bh(gl, lck_ret);
-}
-static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
-                                   unsigned int cur_state)
-{
-        int ret = 0;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
-        return ret;
-}
-/**
- * gfs2_glock_drop_th - call into the lock module to unlock a lock
- * @gl: the glock
- *
- */
-static void gfs2_glock_drop_th(struct gfs2_glock *gl)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        unsigned int ret;
-        if (glops->go_xmote_th)
-                glops->go_xmote_th(gl);
-        if (glops->go_inval)
-                glops->go_inval(gl, DIO_METADATA);
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-        gfs2_glock_hold(gl);
-        ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
-        if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
-                return;
-        if (!ret)
-                drop_bh(gl, ret);
-        else
-                gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
-}
-/**
- * do_cancels - cancel requests for locks stuck waiting on an expire flag
- * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
- *
- * Don't cancel GL_NOCANCEL requests.
- */
-static void do_cancels(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        spin_lock(&gl->gl_spin);
-        while (gl->gl_req_gh != gh &&
-               !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
-               !list_empty(&gh->gh_list)) {
-                if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
-                        spin_unlock(&gl->gl_spin);
-                        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                                sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
-                        msleep(100);
-                        spin_lock(&gl->gl_spin);
-                } else {
-                        spin_unlock(&gl->gl_spin);
-                        msleep(100);
-                        spin_lock(&gl->gl_spin);
-                }
-        }
-        spin_unlock(&gl->gl_spin);
-}
-/**
- * glock_wait_internal - wait on a glock acquisition
 * @gh: the glock holder
 *
 * Returns: 0 on success
 */
-static int glock_wait_internal(struct gfs2_holder *gh)
+int gfs2_glock_wait(struct gfs2_holder *gh)
 {
-        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        if (test_bit(HIF_ABORTED, &gh->gh_iflags))
-                return -EIO;
-        if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-                spin_lock(&gl->gl_spin);
-                if (gl->gl_req_gh != gh &&
-                    !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
-                    !list_empty(&gh->gh_list)) {
-                        list_del_init(&gh->gh_list);
-                        gh->gh_error = GLR_TRYFAILED;
-                        run_queue(gl);
-                        spin_unlock(&gl->gl_spin);
-                        return gh->gh_error;
-                }
-                spin_unlock(&gl->gl_spin);
-        }
-        if (gh->gh_flags & LM_FLAG_PRIORITY)
-                do_cancels(gh);
        wait_on_holder(gh);
-        if (gh->gh_error)
-                return gh->gh_error;
-        gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
-        gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
-                                                   gh->gh_flags));
-        if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
-                gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-                if (glops->go_lock) {
-                        gh->gh_error = glops->go_lock(gh);
-                        if (gh->gh_error) {
-                                spin_lock(&gl->gl_spin);
-                                list_del_init(&gh->gh_list);
-                                spin_unlock(&gl->gl_spin);
-                        }
-                }
-                spin_lock(&gl->gl_spin);
-                gl->gl_req_gh = NULL;
-                clear_bit(GLF_LOCK, &gl->gl_flags);
-                run_queue(gl);
-                spin_unlock(&gl->gl_spin);
-        }
        return gh->gh_error;
 }
-static inline struct gfs2_holder *
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
-find_holder_by_owner(struct list_head *head, struct pid *pid)
-{
-        struct gfs2_holder *gh;
-        list_for_each_entry(gh, head, gh_list) {
-                if (gh->gh_owner_pid == pid)
-                        return gh;
-        }
-        return NULL;
-}
-static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
 {
        va_list args;
        va_start(args, fmt);
-        if (gi) {
+        if (seq) {
+                struct gfs2_glock_iter *gi = seq->private;
                vsprintf(gi->string, fmt, args);
-                seq_printf(gi->seq, gi->string);
+                seq_printf(seq, gi->string);
-        }
+        } else {
-        else
+                printk(KERN_ERR " ");
                vprintk(fmt, args);
+        }
        va_end(args);
 }
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
 * add_to_queue - Add a holder to the wait queue (but look for recursion)
 * @gh: the holder structure to add
 *
+ * Eventually we should move the recursive locking trap to a
+ * debugging option or something like that. This is the fast
+ * path and needs to have the minimum number of distractions.
+ * 
 */
-static void add_to_queue(struct gfs2_holder *gh)
+static inline void add_to_queue(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_holder *existing;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct list_head *insert_pt = NULL;
+        struct gfs2_holder *gh2;
+        int try_lock = 0;
        BUG_ON(gh->gh_owner_pid == NULL);
        if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
                BUG();
-        if (!(gh->gh_flags & GL_FLOCK)) {
+        if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-                existing = find_holder_by_owner(&gl->gl_holders, 
+                if (test_bit(GLF_LOCK, &gl->gl_flags))
-                                                gh->gh_owner_pid);
+                        try_lock = 1;
-                if (existing) {
+                if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
-                        print_symbol(KERN_WARNING "original: %s\n", 
+                        goto fail;
-                                     existing->gh_ip);
+        }
-                        printk(KERN_INFO "pid : %d\n",
-                                        pid_nr(existing->gh_owner_pid));
+        list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
-                        printk(KERN_INFO "lock type : %d lock state : %d\n",
+                if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
-                               existing->gh_gl->gl_name.ln_type, 
+                    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
-                               existing->gh_gl->gl_state);
+                        goto trap_recursive;
-                        print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+                if (try_lock &&
-                        printk(KERN_INFO "pid : %d\n",
+                    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
-                                        pid_nr(gh->gh_owner_pid));
+                    !may_grant(gl, gh)) {
-                        printk(KERN_INFO "lock type : %d lock state : %d\n",
+fail:
-                               gl->gl_name.ln_type, gl->gl_state);
+                        gh->gh_error = GLR_TRYFAILED;
-                        BUG();
+                        gfs2_holder_wake(gh);
-                }
+                        return;
-                
-                existing = find_holder_by_owner(&gl->gl_waiters3, 
-                                                gh->gh_owner_pid);
-                if (existing) {
-                        print_symbol(KERN_WARNING "original: %s\n", 
-                                     existing->gh_ip);
-                        print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-                        BUG();
                }
+                if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+                        continue;
+                if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+                        insert_pt = &gh2->gh_list;
+        }
+        if (likely(insert_pt == NULL)) {
+                list_add_tail(&gh->gh_list, &gl->gl_holders);
+                if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+                        goto do_cancel;
+                return;
+        }
+        list_add_tail(&gh->gh_list, insert_pt);
+do_cancel:
+        gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+        if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+                spin_unlock(&gl->gl_spin);
+                if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+                        sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+                spin_lock(&gl->gl_spin);
        }
+        return;
-        if (gh->gh_flags & LM_FLAG_PRIORITY)
+trap_recursive:
-                list_add(&gh->gh_list, &gl->gl_waiters3);
+        print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
-        else
+        printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
-                list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+        printk(KERN_ERR "lock type: %d req lock state : %d\n",
+               gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+        print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+        printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
+        printk(KERN_ERR "lock type: %d req lock state : %d\n",
+               gh->gh_gl->gl_name.ln_type, gh->gh_state);
+        __dump_glock(NULL, gl);
+        BUG();
 }
 /**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        int error = 0;
-restart:
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
-                set_bit(HIF_ABORTED, &gh->gh_iflags);
                return -EIO;
-        }
        spin_lock(&gl->gl_spin);
        add_to_queue(gh);
-        run_queue(gl);
+        run_queue(gl, 1);
        spin_unlock(&gl->gl_spin);
-        if (!(gh->gh_flags & GL_ASYNC)) {
+        if (!(gh->gh_flags & GL_ASYNC))
-                error = glock_wait_internal(gh);
+                error = gfs2_glock_wait(gh);
-                if (error == GLR_CANCELED) {
-                        msleep(100);
-                        goto restart;
-                }
-        }
        return error;
 }
@@ -1196,48 +980,7 @@ restart:
 int gfs2_glock_poll(struct gfs2_holder *gh)
 {
-        struct gfs2_glock *gl = gh->gh_gl;
+        return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
-        int ready = 0;
-        spin_lock(&gl->gl_spin);
-        if (test_bit(HIF_HOLDER, &gh->gh_iflags))
-                ready = 1;
-        else if (list_empty(&gh->gh_list)) {
-                if (gh->gh_error == GLR_CANCELED) {
-                        spin_unlock(&gl->gl_spin);
-                        msleep(100);
-                        if (gfs2_glock_nq(gh))
-                                return 1;
-                        return 0;
-                } else
-                        ready = 1;
-        }
-        spin_unlock(&gl->gl_spin);
-        return ready;
-}
-/**
- * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
- * @gh: the holder structure
- *
- * Returns: 0, GLR_TRYFAILED, or errno on failure
- */
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
-        int error;
-        error = glock_wait_internal(gh);
-        if (error == GLR_CANCELED) {
-                msleep(100);
-                gh->gh_flags &= ~GL_ASYNC;
-                error = gfs2_glock_nq(gh);
-        }
-        return error;
 }
 /**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
        struct gfs2_glock *gl = gh->gh_gl;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        unsigned delay = 0;
+        int fast_path = 0;
+        spin_lock(&gl->gl_spin);
        if (gh->gh_flags & GL_NOCACHE)
                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-        gfs2_glmutex_lock(gl);
-        spin_lock(&gl->gl_spin);
        list_del_init(&gh->gh_list);
+        if (find_first_holder(gl) == NULL) {
-        if (list_empty(&gl->gl_holders)) {
                if (glops->go_unlock) {
+                        GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
                        spin_unlock(&gl->gl_spin);
                        glops->go_unlock(gh);
                        spin_lock(&gl->gl_spin);
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
                }
                gl->gl_stamp = jiffies;
+                if (list_empty(&gl->gl_holders) &&
+                    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+                    !test_bit(GLF_DEMOTE, &gl->gl_flags))
+                        fast_path = 1;
        }
-        clear_bit(GLF_LOCK, &gl->gl_flags);
        spin_unlock(&gl->gl_spin);
+        if (likely(fast_path))
+                return;
        gfs2_glock_hold(gl);
        if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
 {
        int error = -EIO;
+        if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
+                return 0;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
        return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
 {
        int error;
-        gfs2_glmutex_lock(gl);
        if (!atomic_read(&gl->gl_lvb_count)) {
                error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
-                if (error) {
+                if (error) 
-                        gfs2_glmutex_unlock(gl);
                        return error;
-                }
                gfs2_glock_hold(gl);
        }
        atomic_inc(&gl->gl_lvb_count);
-        gfs2_glmutex_unlock(gl);
        return 0;
 }
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        gfs2_glock_hold(gl);
-        gfs2_glmutex_lock(gl);
        gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
        if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-                if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
                        sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
                gl->gl_lvb = NULL;
                gfs2_glock_put(gl);
        }
-        gfs2_glmutex_unlock(gl);
        gfs2_glock_put(gl);
 }
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
        if (time_before(now, holdtime))
                delay = holdtime - now;
+        spin_lock(&gl->gl_spin);
        handle_callback(gl, state, 1, delay);
+        spin_unlock(&gl->gl_spin);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
 }
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
                gl = gfs2_glock_find(sdp, &async->lc_name);
                if (gfs2_assert_warn(sdp, gl))
                        return;
-                xmote_bh(gl, async->lc_ret);
+                gl->gl_reply = async->lc_ret;
+                set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
                up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
                        wake_up_process(sdp->sd_recoverd_process);
                return;
-        case LM_CB_DROPLOCKS:
-                gfs2_gl_hash_clear(sdp, NO_WAIT);
-                gfs2_quota_scan(sdp);
-                return;
        default:
                gfs2_assert_warn(sdp, 0);
                return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
 {
        struct gfs2_glock *gl;
+        int done_callback = 0;
        spin_lock(&sdp->sd_reclaim_lock);
        if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
        atomic_dec(&sdp->sd_reclaim_count);
        atomic_inc(&sdp->sd_reclaimed);
-        if (gfs2_glmutex_trylock(gl)) {
+        spin_lock(&gl->gl_spin);
-                if (list_empty(&gl->gl_holders) &&
+        if (find_first_holder(gl) == NULL &&
-                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+            gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
-                        handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-                gfs2_glmutex_unlock(gl);
+                done_callback = 1;
        }
+        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
+        if (!done_callback ||
+            queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                gfs2_glock_put(gl);
 }
 /**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
 {
        if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
                return;
+        if (test_bit(GLF_LOCK, &gl->gl_flags))
+                return;
-        if (gfs2_glmutex_trylock(gl)) {
+        spin_lock(&gl->gl_spin);
-                if (list_empty(&gl->gl_holders) &&
+        if (find_first_holder(gl) == NULL &&
-                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+            gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
-                        goto out_schedule;
+                gfs2_glock_schedule_for_reclaim(gl);
-                gfs2_glmutex_unlock(gl);
+        spin_unlock(&gl->gl_spin);
-        }
-        return;
-out_schedule:
-        gfs2_glmutex_unlock(gl);
-        gfs2_glock_schedule_for_reclaim(gl);
 }
 /**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
                spin_unlock(&sdp->sd_reclaim_lock);
        }
-        if (gfs2_glmutex_trylock(gl)) {
+        spin_lock(&gl->gl_spin);
-                if (list_empty(&gl->gl_holders) &&
+        if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
-                    gl->gl_state != LM_ST_UNLOCKED)
+                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-                        handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+        spin_unlock(&gl->gl_spin);
-                gfs2_glmutex_unlock(gl);
+        gfs2_glock_hold(gl);
-        }
+        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                gfs2_glock_put(gl);
 }
 /**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
 * @sdp: the filesystem
 * @wait: wait until it's all gone
 *
- * Called when unmounting the filesystem, or when inter-node lock manager
+ * Called when unmounting the filesystem.
- * requests DROPLOCKS because it is running out of capacity.
 */
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
        unsigned long t;
        unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
                                cont = 1;
                }
-                if (!wait || !cont)
+                if (!cont)
                        break;
                if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
        }
 }
-/*
+static const char *state2str(unsigned state)
- *  Diagnostic routines to help debug distributed deadlock
- */
-static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
-                              unsigned long address)
 {
-        char buffer[KSYM_SYMBOL_LEN];
+        switch(state) {
+        case LM_ST_UNLOCKED:
-        sprint_symbol(buffer, address);
+                return "UN";
-        print_dbg(gi, fmt, buffer);
+        case LM_ST_SHARED:
+                return "SH";
+        case LM_ST_DEFERRED:
+                return "DF";
+        case LM_ST_EXCLUSIVE:
+                return "EX";
+        }
+        return "??";
+}
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+        char *p = buf;
+        if (flags & LM_FLAG_TRY)
+                *p++ = 't';
+        if (flags & LM_FLAG_TRY_1CB)
+                *p++ = 'T';
+        if (flags & LM_FLAG_NOEXP)
+                *p++ = 'e';
+        if (flags & LM_FLAG_ANY)
+                *p++ = 'a';
+        if (flags & LM_FLAG_PRIORITY)
+                *p++ = 'p';
+        if (flags & GL_ASYNC)
+                *p++ = 'a';
+        if (flags & GL_EXACT)
+                *p++ = 'E';
+        if (flags & GL_ATIME)
+                *p++ = 'a';
+        if (flags & GL_NOCACHE)
+                *p++ = 'c';
+        if (test_bit(HIF_HOLDER, &iflags))
+                *p++ = 'H';
+        if (test_bit(HIF_WAIT, &iflags))
+                *p++ = 'W';
+        if (test_bit(HIF_FIRST, &iflags))
+                *p++ = 'F';
+        *p = 0;
+        return buf;
 }
 /**
 * dump_holder - print information about a glock holder
- * @str: a string naming the type of holder
+ * @seq: the seq_file struct
 * @gh: the glock holder
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */
-static int dump_holder(struct glock_iter *gi, char *str,
+static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
-                       struct gfs2_holder *gh)
 {
-        unsigned int x;
+        struct task_struct *gh_owner = NULL;
-        struct task_struct *gh_owner;
+        char buffer[KSYM_SYMBOL_LEN];
+        char flags_buf[32];
-        print_dbg(gi, "  %s\n", str);
+        sprint_symbol(buffer, gh->gh_ip);
-        if (gh->gh_owner_pid) {
+        if (gh->gh_owner_pid)
-                print_dbg(gi, "    owner = %ld ",
-                                (long)pid_nr(gh->gh_owner_pid));
                gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-                if (gh_owner)
+        gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
-                        print_dbg(gi, "(%s)\n", gh_owner->comm);
+                  state2str(gh->gh_state),
-                else
+                  hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                        print_dbg(gi, "(ended)\n");
+                  gh->gh_error, 
-        } else
+                  gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                print_dbg(gi, "    owner = -1\n");
+                  gh_owner ? gh_owner->comm : "(ended)", buffer);
-        print_dbg(gi, "    gh_state = %u\n", gh->gh_state);
-        print_dbg(gi, "    gh_flags =");
-        for (x = 0; x < 32; x++)
-                if (gh->gh_flags & (1 << x))
-                        print_dbg(gi, " %u", x);
-        print_dbg(gi, " \n");
-        print_dbg(gi, "    error = %d\n", gh->gh_error);
-        print_dbg(gi, "    gh_iflags =");
-        for (x = 0; x < 32; x++)
-                if (test_bit(x, &gh->gh_iflags))
-                        print_dbg(gi, " %u", x);
-        print_dbg(gi, " \n");
-        gfs2_print_symbol(gi, "    initialized at: %s\n", gh->gh_ip);
        return 0;
 }
-/**
+static const char *gflags2str(char *buf, const unsigned long *gflags)
- * dump_inode - print information about an inode
+{
- * @ip: the inode
+        char *p = buf;
- *
+        if (test_bit(GLF_LOCK, gflags))
- * Returns: 0 on success, -ENOBUFS when we run out of space
+                *p++ = 'l';
- */
+        if (test_bit(GLF_STICKY, gflags))
+                *p++ = 's';
-static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
+        if (test_bit(GLF_DEMOTE, gflags))
-{
+                *p++ = 'D';
-        unsigned int x;
+        if (test_bit(GLF_PENDING_DEMOTE, gflags))
+                *p++ = 'd';
-        print_dbg(gi, "  Inode:\n");
+        if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
-        print_dbg(gi, "    num = %llu/%llu\n",
+                *p++ = 'p';
-                  (unsigned long long)ip->i_no_formal_ino,
+        if (test_bit(GLF_DIRTY, gflags))
-                  (unsigned long long)ip->i_no_addr);
+                *p++ = 'y';
-        print_dbg(gi, "    type = %u\n", IF2DT(ip->i_inode.i_mode));
+        if (test_bit(GLF_LFLUSH, gflags))
-        print_dbg(gi, "    i_flags =");
+                *p++ = 'f';
-        for (x = 0; x < 32; x++)
+        if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
-                if (test_bit(x, &ip->i_flags))
+                *p++ = 'i';
-                        print_dbg(gi, " %u", x);
+        if (test_bit(GLF_REPLY_PENDING, gflags))
-        print_dbg(gi, " \n");
+                *p++ = 'r';
-        return 0;
+        *p = 0;
+        return buf;
 }
 /**
- * dump_glock - print information about a glock
+ * __dump_glock - print information about a glock
+ * @seq: The seq_file struct
 * @gl: the glock
- * @count: where we are in the buffer
+ *
+ * The file format is as follows:
+ * One line per object, capital letters are used to indicate objects
+ * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
+ * other objects are indented by a single space and follow the glock to
+ * which they are related. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings which are in
+ * [] so that it's possible to see if they are composed of spaces for
+ * example. The fields are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 {
-        struct gfs2_holder *gh;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        unsigned int x;
+        unsigned long long dtime;
-        int error = -ENOBUFS;
+        const struct gfs2_holder *gh;
-        struct task_struct *gl_owner;
+        char gflags_buf[32];
+        int error = 0;
-        spin_lock(&gl->gl_spin);
+        dtime = jiffies - gl->gl_demote_time;
+        dtime *= 1000000/HZ; /* demote time in uSec */
+        if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+                dtime = 0;
+        gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+                  state2str(gl->gl_state),
+                  gl->gl_name.ln_type,
+                  (unsigned long long)gl->gl_name.ln_number,
+                  gflags2str(gflags_buf, &gl->gl_flags),
+                  state2str(gl->gl_target),
+                  state2str(gl->gl_demote_state), dtime,
+                  atomic_read(&gl->gl_lvb_count),
+                  atomic_read(&gl->gl_ail_count),
+                  atomic_read(&gl->gl_ref));
-        print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
-                   (unsigned long long)gl->gl_name.ln_number);
-        print_dbg(gi, "  gl_flags =");
-        for (x = 0; x < 32; x++) {
-                if (test_bit(x, &gl->gl_flags))
-                        print_dbg(gi, " %u", x);
-        }
-        if (!test_bit(GLF_LOCK, &gl->gl_flags))
-                print_dbg(gi, " (unlocked)");
-        print_dbg(gi, " \n");
-        print_dbg(gi, "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
-        print_dbg(gi, "  gl_state = %u\n", gl->gl_state);
-        if (gl->gl_owner_pid) {
-                gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
-                if (gl_owner)
-                        print_dbg(gi, "  gl_owner = pid %d (%s)\n",
-                                  pid_nr(gl->gl_owner_pid), gl_owner->comm);
-                else
-                        print_dbg(gi, "  gl_owner = %d (ended)\n",
-                                  pid_nr(gl->gl_owner_pid));
-        } else
-                print_dbg(gi, "  gl_owner = -1\n");
-        print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
-        print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-        print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
-        print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-        print_dbg(gi, "  reclaim = %s\n",
-                   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
-        if (gl->gl_aspace)
-                print_dbg(gi, "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
-                           gl->gl_aspace->i_mapping->nrpages);
-        else
-                print_dbg(gi, "  aspace = no\n");
-        print_dbg(gi, "  ail = %d\n", atomic_read(&gl->gl_ail_count));
-        if (gl->gl_req_gh) {
-                error = dump_holder(gi, "Request", gl->gl_req_gh);
-                if (error)
-                        goto out;
-        }
        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-                error = dump_holder(gi, "Holder", gh);
+                error = dump_holder(seq, gh);
                if (error)
                        goto out;
        }
-        list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+        if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
-                error = dump_holder(gi, "Waiter1", gh);
+                error = glops->go_dump(seq, gl);
-                if (error)
-                        goto out;
-        }
-        list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
-                error = dump_holder(gi, "Waiter3", gh);
-                if (error)
-                        goto out;
-        }
-        if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
-                print_dbg(gi, "  Demotion req to state %u (%llu uS ago)\n",
-                          gl->gl_demote_state, (unsigned long long)
-                          (jiffies - gl->gl_demote_time)*(1000000/HZ));
-        }
-        if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
-                if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
-                        list_empty(&gl->gl_holders)) {
-                        error = dump_inode(gi, gl->gl_object);
-                        if (error)
-                                goto out;
-                } else {
-                        error = -ENOBUFS;
-                        print_dbg(gi, "  Inode: busy\n");
-                }
-        }
-        error = 0;
 out:
-        spin_unlock(&gl->gl_spin);
        return error;
 }
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+        int ret;
+        spin_lock(&gl->gl_spin);
+        ret = __dump_glock(seq, gl);
+        spin_unlock(&gl->gl_spin);
+        return ret;
+}
 /**
 * gfs2_dump_lockstate - print out the current lockstate
 * @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
 module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-static int gfs2_glock_iter_next(struct glock_iter *gi)
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
        struct gfs2_glock *gl;
@@ -2104,7 +1824,7 @@ restart:
                gfs2_glock_put(gl);
        if (gl && gi->gl == NULL)
                gi->hash++;
-        while(gi->gl == NULL) {
+        while (gi->gl == NULL) {
                if (gi->hash >= GFS2_GL_HASH_SIZE)
                        return 1;
                read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
        return 0;
 }
-static void gfs2_glock_iter_free(struct glock_iter *gi)
+static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
 {
        if (gi->gl)
                gfs2_glock_put(gi->gl);
-        kfree(gi);
-}
-static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
-{
-        struct glock_iter *gi;
-        gi = kmalloc(sizeof (*gi), GFP_KERNEL);
-        if (!gi)
-                return NULL;
-        gi->sdp = sdp;
-        gi->hash = 0;
-        gi->seq = NULL;
        gi->gl = NULL;
-        memset(gi->string, 0, sizeof(gi->string));
-        if (gfs2_glock_iter_next(gi)) {
-                gfs2_glock_iter_free(gi);
-                return NULL;
-        }
-        return gi;
 }
-static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 {
-        struct glock_iter *gi;
+        struct gfs2_glock_iter *gi = seq->private;
        loff_t n = *pos;
-        gi = gfs2_glock_iter_init(file->private);
+        gi->hash = 0;
-        if (!gi)
-                return NULL;
-        while(n--) {
+        do {
                if (gfs2_glock_iter_next(gi)) {
                        gfs2_glock_iter_free(gi);
                        return NULL;
                }
-        }
+        } while (n--);
-        return gi;
+        return gi->gl;
 }
-static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
                                 loff_t *pos)
 {
-        struct glock_iter *gi = iter_ptr;
+        struct gfs2_glock_iter *gi = seq->private;
        (*pos)++;
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
                return NULL;
        }
-        return gi;
+        return gi->gl;
 }
-static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
-        struct glock_iter *gi = iter_ptr;
+        struct gfs2_glock_iter *gi = seq->private;
-        if (gi)
+        gfs2_glock_iter_free(gi);
-                gfs2_glock_iter_free(gi);
 }
-static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-        struct glock_iter *gi = iter_ptr;
+        return dump_glock(seq, iter_ptr);
-        gi->seq = file;
-        dump_glock(gi, gi->gl);
-        return 0;
 }
 static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
 static int gfs2_debugfs_open(struct inode *inode, struct file *file)
 {
-        struct seq_file *seq;
+        int ret = seq_open_private(file, &gfs2_glock_seq_ops,
-        int ret;
+                                   sizeof(struct gfs2_glock_iter));
+        if (ret == 0) {
-        ret = seq_open(file, &gfs2_glock_seq_ops);
+                struct seq_file *seq = file->private_data;
-        if (ret)
+                struct gfs2_glock_iter *gi = seq->private;
-                return ret;
+                gi->sdp = inode->i_private;
+        }
-        seq = file->private_data;
+        return ret;
-        seq->private = inode->i_private;
-        return 0;
 }
 static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
        .open    = gfs2_debugfs_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-        .release = seq_release
+        .release = seq_release_private,
 };
 int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f8150..971d92af70fc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
 #define GL_SKIP                 0x00000100
 #define GL_ATIME                0x00000200
 #define GL_NOCACHE              0x00000400
-#define GL_FLOCK                0x00000800
-#define GL_NOCANCEL             0x00001000
 #define GLR_TRYFAILED           13
-#define GLR_CANCELED            14
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
        spin_lock(&gl->gl_spin);
        pid = task_pid(current);
        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+                if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        break;
                if (gh->gh_owner_pid == pid)
                        goto out;
        }
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 {
        int ret;
        spin_lock(&gl->gl_spin);
-        ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
+        ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
        spin_unlock(&gl->gl_spin);
        return ret;
 }
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 /**
 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
 void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda4..c6c318c2a0f6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/bio.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
 }
 /**
- * inode_go_xmote_bh - After promoting/demoting a glock
- * @gl: the glock
- *
- */
-static void inode_go_xmote_bh(struct gfs2_glock *gl)
-{
-        struct gfs2_holder *gh = gl->gl_req_gh;
-        struct buffer_head *bh;
-        int error;
-        if (gl->gl_state != LM_ST_UNLOCKED &&
-            (!gh || !(gh->gh_flags & GL_SKIP))) {
-                error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
-                if (!error)
-                        brelse(bh);
-        }
-}
-/**
 * inode_go_inval - prepare a inode glock to be released
 * @gl: the glock
 * @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
 }
 /**
+ * inode_go_dump - print information about an inode
+ * @seq: The iterator
+ * @gl: the glock whose attached inode (gl->gl_object) is printed
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+        const struct gfs2_inode *ip = gl->gl_object;
+        if (ip == NULL)
+                return 0;
+        gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+                  (unsigned long long)ip->i_no_formal_ino,
+                  (unsigned long long)ip->i_no_addr,
+                  IF2DT(ip->i_inode.i_mode), ip->i_flags);
+        return 0;
+}
+/**
 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
 * @gl: the glock
 *
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 }
 /**
+ * rgrp_go_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+        const struct gfs2_rgrpd *rgd = gl->gl_object;
+        if (rgd == NULL)
+                return 0;
+        gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+        return 0;
+}
+/**
 * trans_go_sync - promote/demote the transaction glock
 * @gl: the glock
 * @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
 *
 */
-static void trans_go_xmote_bh(struct gfs2_glock *gl)
+static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
        struct gfs2_log_header_host head;
        int error;
-        if (gl->gl_state != LM_ST_UNLOCKED &&
+        if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
                j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
                error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
                        gfs2_log_pointers_init(sdp, head.lh_blkno);
                }
        }
+        return 0;
 }
 /**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
 const struct gfs2_glock_operations gfs2_inode_glops = {
        .go_xmote_th = inode_go_sync,
-        .go_xmote_bh = inode_go_xmote_bh,
        .go_inval = inode_go_inval,
        .go_demote_ok = inode_go_demote_ok,
        .go_lock = inode_go_lock,
+        .go_dump = inode_go_dump,
        .go_type = LM_TYPE_INODE,
-        .go_min_hold_time = HZ / 10,
+        .go_min_hold_time = HZ / 5,
 };
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
        .go_demote_ok = rgrp_go_demote_ok,
        .go_lock = rgrp_go_lock,
        .go_unlock = rgrp_go_unlock,
+        .go_dump = rgrp_go_dump,
        .go_type = LM_TYPE_RGRP,
-        .go_min_hold_time = HZ / 10,
+        .go_min_hold_time = HZ / 5,
 };
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41da..448697a5c462 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
 struct gfs2_rgrpd {
        struct list_head rd_list;       /* Link with superblock */
        struct list_head rd_list_mru;
-        struct list_head rd_recent;     /* Recently used rgrps */
        struct gfs2_glock *rd_gl;       /* Glock for this rgrp */
        u64 rd_addr;                    /* grp block disk address */
        u64 rd_data0;                   /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
 struct gfs2_glock_operations {
        void (*go_xmote_th) (struct gfs2_glock *gl);
-        void (*go_xmote_bh) (struct gfs2_glock *gl);
+        int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
        void (*go_inval) (struct gfs2_glock *gl, int flags);
        int (*go_demote_ok) (struct gfs2_glock *gl);
        int (*go_lock) (struct gfs2_holder *gh);
        void (*go_unlock) (struct gfs2_holder *gh);
+        int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
        const int go_type;
        const unsigned long go_min_hold_time;
 };
 enum {
        /* States */
-        HIF_HOLDER              = 6,
+        HIF_HOLDER              = 6,  /* Set for gh that "holds" the glock */
        HIF_FIRST               = 7,
-        HIF_ABORTED             = 9,
        HIF_WAIT                = 10,
 };
@@ -154,20 +153,20 @@ struct gfs2_holder {
        unsigned gh_flags;
        int gh_error;
-        unsigned long gh_iflags;
+        unsigned long gh_iflags; /* HIF_... */
        unsigned long gh_ip;
 };
 enum {
-        GLF_LOCK                = 1,
+        GLF_LOCK                        = 1,
-        GLF_STICKY              = 2,
+        GLF_STICKY                      = 2,
-        GLF_DEMOTE              = 3,
+        GLF_DEMOTE                      = 3,
-        GLF_PENDING_DEMOTE      = 4,
+        GLF_PENDING_DEMOTE              = 4,
-        GLF_DIRTY               = 5,
+        GLF_DEMOTE_IN_PROGRESS          = 5,
-        GLF_DEMOTE_IN_PROGRESS  = 6,
+        GLF_DIRTY                       = 6,
-        GLF_LFLUSH              = 7,
+        GLF_LFLUSH                      = 7,
-        GLF_WAITERS2            = 8,
+        GLF_INVALIDATE_IN_PROGRESS      = 8,
-        GLF_CONV_DEADLK         = 9,
+        GLF_REPLY_PENDING               = 9,
 };
 struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
        spinlock_t gl_spin;
        unsigned int gl_state;
+        unsigned int gl_target;
+        unsigned int gl_reply;
        unsigned int gl_hash;
        unsigned int gl_demote_state; /* state requested by remote node */
        unsigned long gl_demote_time; /* time of first demote request */
-        struct pid *gl_owner_pid;
-        unsigned long gl_ip;
        struct list_head gl_holders;
-        struct list_head gl_waiters1;   /* HIF_MUTEX */
-        struct list_head gl_waiters3;   /* HIF_PROMOTE */
        const struct gfs2_glock_operations *gl_ops;
-        struct gfs2_holder *gl_req_gh;
        void *gl_lock;
        char *gl_lvb;
        atomic_t gl_lvb_count;
@@ -427,7 +421,6 @@ struct gfs2_tune {
        unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
        unsigned int gt_atime_quantum; /* Min secs between atime updates */
        unsigned int gt_new_files_jdata;
-        unsigned int gt_new_files_directio;
        unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
        unsigned int gt_stall_secs; /* Detects trouble! */
        unsigned int gt_complain_secs;
@@ -534,7 +527,6 @@ struct gfs2_sbd {
        struct mutex sd_rindex_mutex;
        struct list_head sd_rindex_list;
        struct list_head sd_rindex_mru_list;
-        struct list_head sd_rindex_recent_list;
        struct gfs2_rgrpd *sd_rindex_forward;
        unsigned int sd_rgrps;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e41..6da0ab355b8a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
        }
        if (!is_root) {
-                error = permission(dir, MAY_EXEC, NULL);
+                error = gfs2_permission(dir, MAY_EXEC);
                if (error)
                        goto out;
        }
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
        int error;
-        error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
                    gfs2_tune_get(sdp, gt_new_files_jdata))
                        di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
-                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
-                    gfs2_tune_get(sdp, gt_new_files_directio))
-                        di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
        } else if (S_ISDIR(mode)) {
                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
-                                            GFS2_DIF_INHERIT_DIRECTIO);
-                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
                                            GFS2_DIF_INHERIT_JDATA);
        }
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
        if (IS_APPEND(&dip->i_inode))
                return -EPERM;
-        error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38f..6074c2506f75 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 }
-void gfs2_inode_attr_in(struct gfs2_inode *ip);
 void gfs2_set_iop(struct inode *inode);
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
                                u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
                struct gfs2_inode *ip);
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
                   const struct gfs2_inode *ip);
+int gfs2_permission(struct inode *inode, int mask);
 int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
 int gfs2_glock_nq_atime(struct gfs2_holder *gh);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee728783..523243a13a21 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
        const struct lm_lockops *lw_ops;
 };
+static int nolock_mount(char *table_name, char *host_data,
+                        lm_callback_t cb, void *cb_data,
+                        unsigned int min_lvb_size, int flags,
+                        struct lm_lockstruct *lockstruct,
+                        struct kobject *fskobj);
 /* List of registered low-level locking protocols.  A file system selects one
   of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+static const struct lm_lockops nolock_ops = {
+        .lm_proto_name = "lock_nolock",
+        .lm_mount = nolock_mount,
+};
+static struct lmh_wrapper nolock_proto  = {
+        .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
+        .lw_ops = &nolock_ops,
+};
 static LIST_HEAD(lmh_list);
 static DEFINE_MUTEX(lmh_lock);
+/**
+ * nolock_mount - mount handler for the built-in "lock_nolock" protocol
+ * @table_name: not used by this handler
+ * @host_data: host data string, searched for an optional "jid=<n>" entry
+ * @cb: not used by this handler
+ * @cb_data: not used by this handler
+ * @min_lvb_size: copied into the lockstruct as the LVB size
+ * @flags: not used by this handler
+ * @lockstruct: filled in with the journal id, LVB size, ops and flags
+ * @fskobj: not used by this handler
+ *
+ * Returns: 0 (there is no failure path)
+ */
+static int nolock_mount(char *table_name, char *host_data,
+                        lm_callback_t cb, void *cb_data,
+                        unsigned int min_lvb_size, int flags,
+                        struct lm_lockstruct *lockstruct,
+                        struct kobject *fskobj)
+{
+        char *c;
+        /* Default journal id, used when "jid=" is missing or unparsable. */
+        unsigned int jid = 0;
+        c = strstr(host_data, "jid=");
+        /* sscanf() leaves jid untouched if nothing converts, so a malformed
+           "jid=" value falls back to 0 instead of an indeterminate value. */
+        if (c)
+                sscanf(c + 4, "%u", &jid);
+        lockstruct->ls_jid = jid;
+        lockstruct->ls_first = 1;
+        lockstruct->ls_lvb_size = min_lvb_size;
+        lockstruct->ls_ops = &nolock_ops;
+        lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+        return 0;
+}
 /**
 * gfs2_register_lockproto - Register a low-level locking protocol
 * @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
        int try = 0;
        int error, found;
 retry:
        mutex_lock(&lmh_lock);
+        if (list_empty(&nolock_proto.lw_list))
+                list_add(&nolock_proto.lw_list, &lmh_list);
        found = 0;
        list_for_each_entry(lw, &lmh_list, lw_list) {
                if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
                goto out;
        }
-        if (!try_module_get(lw->lw_ops->lm_owner)) {
+        if (lw->lw_ops->lm_owner &&
+            !try_module_get(lw->lw_ops->lm_owner)) {
                try = 0;
                mutex_unlock(&lmh_lock);
                msleep(1000);
@@ -158,7 +205,8 @@ out:
 void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
 {
        mutex_lock(&lmh_lock);
-        lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+        if (lockstruct->ls_ops->lm_unmount)
+                lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
        if (lockstruct->ls_ops->lm_owner)
                module_put(lockstruct->ls_ops->lm_owner);
        mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec87..2482c9047505 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
 static char junk_lvb[GDLM_LVB_SIZE];
-static void queue_complete(struct gdlm_lock *lp)
+/* convert dlm lock-mode to gfs lock-state */
+static s16 gdlm_make_lmstate(s16 dlmmode)
 {
-        struct gdlm_ls *ls = lp->ls;
+        switch (dlmmode) {
+        case DLM_LOCK_IV:
+        case DLM_LOCK_NL:
+                return LM_ST_UNLOCKED;
+        case DLM_LOCK_EX:
+                return LM_ST_EXCLUSIVE;
+        case DLM_LOCK_CW:
+                return LM_ST_DEFERRED;
+        case DLM_LOCK_PR:
+                return LM_ST_SHARED;
+        }
+        gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+        return -1;
+}
-        clear_bit(LFL_ACTIVE, &lp->flags);
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+   thread gets to it. */
+static void queue_submit(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
        spin_lock(&ls->async_lock);
-        list_add_tail(&lp->clist, &ls->complete);
+        list_add_tail(&lp->delay_list, &ls->submit);
        spin_unlock(&ls->async_lock);
        wake_up(&ls->thread_wait);
 }
-static inline void gdlm_ast(void *astarg)
+static void wake_up_ast(struct gdlm_lock *lp)
 {
-        queue_complete(astarg);
+        clear_bit(LFL_AST_WAIT, &lp->flags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&lp->flags, LFL_AST_WAIT);
 }
-static inline void gdlm_bast(void *astarg, int mode)
+static void gdlm_delete_lp(struct gdlm_lock *lp)
 {
-        struct gdlm_lock *lp = astarg;
        struct gdlm_ls *ls = lp->ls;
-        if (!mode) {
-                printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
-                        lp->lockname.ln_type,
-                        (unsigned long long)lp->lockname.ln_number);
-                return;
-        }
        spin_lock(&ls->async_lock);
-        if (!lp->bast_mode) {
+        if (!list_empty(&lp->delay_list))
-                list_add_tail(&lp->blist, &ls->blocking);
+                list_del_init(&lp->delay_list);
-                lp->bast_mode = mode;
+        ls->all_locks_count--;
-        } else if (lp->bast_mode < mode)
-                lp->bast_mode = mode;
        spin_unlock(&ls->async_lock);
-        wake_up(&ls->thread_wait);
+        kfree(lp);
 }
-void gdlm_queue_delayed(struct gdlm_lock *lp)
+static void gdlm_queue_delayed(struct gdlm_lock *lp)
 {
        struct gdlm_ls *ls = lp->ls;
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
        spin_unlock(&ls->async_lock);
 }
+static void process_complete(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
+        struct lm_async_cb acb;
+        memset(&acb, 0, sizeof(acb));
+        if (lp->lksb.sb_status == -DLM_ECANCEL) {
+                log_info("complete dlm cancel %x,%llx flags %lx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number,
+                         lp->flags);
+                lp->req = lp->cur;
+                acb.lc_ret |= LM_OUT_CANCELED;
+                if (lp->cur == DLM_LOCK_IV)
+                        lp->lksb.sb_lkid = 0;
+                goto out;
+        }
+        if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+                if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+                        log_info("unlock sb_status %d %x,%llx flags %lx",
+                                 lp->lksb.sb_status, lp->lockname.ln_type,
+                                 (unsigned long long)lp->lockname.ln_number,
+                                 lp->flags);
+                        return;
+                }
+                lp->cur = DLM_LOCK_IV;
+                lp->req = DLM_LOCK_IV;
+                lp->lksb.sb_lkid = 0;
+                if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+                        gdlm_delete_lp(lp);
+                        return;
+                }
+                goto out;
+        }
+        if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+                memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+        if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+                if (lp->req == DLM_LOCK_PR)
+                        lp->req = DLM_LOCK_CW;
+                else if (lp->req == DLM_LOCK_CW)
+                        lp->req = DLM_LOCK_PR;
+        }
+        /*
+         * A canceled lock request.  The lock was just taken off the delayed
+         * list and was never even submitted to dlm.
+         */
+        if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+                log_info("complete internal cancel %x,%llx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number);
+                lp->req = lp->cur;
+                acb.lc_ret |= LM_OUT_CANCELED;
+                goto out;
+        }
+        /*
+         * An error occurred.
+         */
+        if (lp->lksb.sb_status) {
+                /* a "normal" error */
+                if ((lp->lksb.sb_status == -EAGAIN) &&
+                    (lp->lkf & DLM_LKF_NOQUEUE)) {
+                        lp->req = lp->cur;
+                        if (lp->cur == DLM_LOCK_IV)
+                                lp->lksb.sb_lkid = 0;
+                        goto out;
+                }
+                /* this could only happen with cancels I think */
+                log_info("ast sb_status %d %x,%llx flags %lx",
+                         lp->lksb.sb_status, lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number,
+                         lp->flags);
+                return;
+        }
+        /*
+         * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+         */
+        if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+                wake_up_ast(lp);
+                return;
+        }
+        /*
+         * A lock has been demoted to NL because it initially completed during
+         * BLOCK_LOCKS.  Now it must be requested in the originally requested
+         * mode.
+         */
+        if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+                gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+                            lp->lockname.ln_type,
+                            (unsigned long long)lp->lockname.ln_number);
+                gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+                            lp->lockname.ln_type,
+                            (unsigned long long)lp->lockname.ln_number);
+                lp->cur = DLM_LOCK_NL;
+                lp->req = lp->prev_req;
+                lp->prev_req = DLM_LOCK_IV;
+                lp->lkf &= ~DLM_LKF_CONVDEADLK;
+                set_bit(LFL_NOCACHE, &lp->flags);
+                if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+                    !test_bit(LFL_NOBLOCK, &lp->flags))
+                        gdlm_queue_delayed(lp);
+                else
+                        queue_submit(lp);
+                return;
+        }
+        /*
+         * A request is granted during dlm recovery.  It may be granted
+         * because the locks of a failed node were cleared.  In that case,
+         * there may be inconsistent data beneath this lock and we must wait
+         * for recovery to complete to use it.  When gfs recovery is done this
+         * granted lock will be converted to NL and then reacquired in this
+         * granted state.
+         */
+        if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+            !test_bit(LFL_NOBLOCK, &lp->flags) &&
+            lp->req != DLM_LOCK_NL) {
+                lp->cur = lp->req;
+                lp->prev_req = lp->req;
+                lp->req = DLM_LOCK_NL;
+                lp->lkf |= DLM_LKF_CONVERT;
+                lp->lkf &= ~DLM_LKF_CONVDEADLK;
+                log_debug("rereq %x,%llx id %x %d,%d",
+                          lp->lockname.ln_type,
+                          (unsigned long long)lp->lockname.ln_number,
+                          lp->lksb.sb_lkid, lp->cur, lp->req);
+                set_bit(LFL_REREQUEST, &lp->flags);
+                queue_submit(lp);
+                return;
+        }
+        /*
+         * DLM demoted the lock to NL before it was granted so GFS must be
+         * told it cannot cache data for this lock.
+         */
+        if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+                set_bit(LFL_NOCACHE, &lp->flags);
+out:
+        /*
+         * This is an internal lock_dlm lock
+         */
+        if (test_bit(LFL_INLOCK, &lp->flags)) {
+                clear_bit(LFL_NOBLOCK, &lp->flags);
+                lp->cur = lp->req;
+                wake_up_ast(lp);
+                return;
+        }
+        /*
+         * Normal completion of a lock request.  Tell GFS it now has the lock.
+         */
+        clear_bit(LFL_NOBLOCK, &lp->flags);
+        lp->cur = lp->req;
+        acb.lc_name = lp->lockname;
+        acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+        ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+static void gdlm_ast(void *astarg)
+{
+        struct gdlm_lock *lp = astarg;
+        clear_bit(LFL_ACTIVE, &lp->flags);
+        process_complete(lp);
+}
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+        struct gdlm_ls *ls = lp->ls;
+        unsigned int cb = 0;
+        switch (gdlm_make_lmstate(bast_mode)) {
+        case LM_ST_EXCLUSIVE:
+                cb = LM_CB_NEED_E;
+                break;
+        case LM_ST_DEFERRED:
+                cb = LM_CB_NEED_D;
+                break;
+        case LM_ST_SHARED:
+                cb = LM_CB_NEED_S;
+                break;
+        default:
+                gdlm_assert(0, "unknown bast mode %u", bast_mode);
+        }
+        ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+static void gdlm_bast(void *astarg, int mode)
+{
+        struct gdlm_lock *lp = astarg;
+        if (!mode) {
+                printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+                        lp->lockname.ln_type,
+                        (unsigned long long)lp->lockname.ln_number);
+                return;
+        }
+        process_blocking(lp, mode);
+}
 /* convert gfs lock-state to dlm lock-mode */
 static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
        return -1;
 }
-/* convert dlm lock-mode to gfs lock-state */
-s16 gdlm_make_lmstate(s16 dlmmode)
-{
-        switch (dlmmode) {
-        case DLM_LOCK_IV:
-        case DLM_LOCK_NL:
-                return LM_ST_UNLOCKED;
-        case DLM_LOCK_EX:
-                return LM_ST_EXCLUSIVE;
-        case DLM_LOCK_CW:
-                return LM_ST_DEFERRED;
-        case DLM_LOCK_PR:
-                return LM_ST_SHARED;
-        }
-        gdlm_assert(0, "unknown DLM mode %d", dlmmode);
-        return -1;
-}
 /* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
        if (lp->lksb.sb_lkid != 0) {
                lkf |= DLM_LKF_CONVERT;
-                /* Conversion deadlock avoidance by DLM */
-                if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
-                    !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
-                    !(lkf & DLM_LKF_NOQUEUE) &&
-                    cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
-                        lkf |= DLM_LKF_CONVDEADLK;
        }
        if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
        make_strname(name, &lp->strname);
        lp->ls = ls;
        lp->cur = DLM_LOCK_IV;
-        lp->lvb = NULL;
-        lp->hold_null = NULL;
-        INIT_LIST_HEAD(&lp->clist);
-        INIT_LIST_HEAD(&lp->blist);
        INIT_LIST_HEAD(&lp->delay_list);
        spin_lock(&ls->async_lock);
-        list_add(&lp->all_list, &ls->all_locks);
        ls->all_locks_count++;
        spin_unlock(&ls->async_lock);
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
        return 0;
 }
-void gdlm_delete_lp(struct gdlm_lock *lp)
-{
-        struct gdlm_ls *ls = lp->ls;
-        spin_lock(&ls->async_lock);
-        if (!list_empty(&lp->clist))
-                list_del_init(&lp->clist);
-        if (!list_empty(&lp->blist))
-                list_del_init(&lp->blist);
-        if (!list_empty(&lp->delay_list))
-                list_del_init(&lp->delay_list);
-        gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
-                    (unsigned long long)lp->lockname.ln_number);
-        list_del_init(&lp->all_list);
-        ls->all_locks_count--;
-        spin_unlock(&ls->async_lock);
-        kfree(lp);
-}
 int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
                  void **lockp)
 {
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
        if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
                lp->lksb.sb_status = -EAGAIN;
-                queue_complete(lp);
+                gdlm_ast(lp);
                error = 0;
        }
@@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
 {
        struct gdlm_lock *lp = lock;
+        if (req_state == LM_ST_UNLOCKED)
+                return gdlm_unlock(lock, cur_state);
+        if (req_state == LM_ST_UNLOCKED)
+                return gdlm_unlock(lock, cur_state);
        clear_bit(LFL_DLM_CANCEL, &lp->flags);
        if (flags & LM_FLAG_NOEXP)
                set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
        if (delay_list) {
                set_bit(LFL_CANCEL, &lp->flags);
                set_bit(LFL_ACTIVE, &lp->flags);
-                queue_complete(lp);
+                gdlm_ast(lp);
                return;
        }
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
        wake_up(&ls->thread_wait);
 }
-int gdlm_release_all_locks(struct gdlm_ls *ls)
-{
-        struct gdlm_lock *lp, *safe;
-        int count = 0;
-        spin_lock(&ls->async_lock);
-        list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
-                list_del_init(&lp->all_list);
-                if (lp->lvb && lp->lvb != junk_lvb)
-                        kfree(lp->lvb);
-                kfree(lp);
-                count++;
-        }
-        spin_unlock(&ls->async_lock);
-        return count;
-}
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54e..3c98e7c6f93b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
        int                     recover_jid_done;
        int                     recover_jid_status;
        spinlock_t              async_lock;
-        struct list_head        complete;
-        struct list_head        blocking;
        struct list_head        delayed;
        struct list_head        submit;
-        struct list_head        all_locks;
        u32             all_locks_count;
        wait_queue_head_t       wait_control;
-        struct task_struct      *thread1;
+        struct task_struct      *thread;
-        struct task_struct      *thread2;
        wait_queue_head_t       thread_wait;
-        unsigned long           drop_time;
-        int                     drop_locks_count;
-        int                     drop_locks_period;
 };
 enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
        u32                     lkf;            /* dlm flags DLM_LKF_ */
        unsigned long           flags;          /* lock_dlm flags LFL_ */
-        int                     bast_mode;      /* protected by async_lock */
-        struct list_head        clist;          /* complete */
-        struct list_head        blist;          /* blocking */
        struct list_head        delay_list;     /* delayed */
-        struct list_head        all_list;       /* all locks for the fs */
        struct gdlm_lock        *hold_null;     /* NL lock for hold_lvb */
 };
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
 /* lock.c */
-s16 gdlm_make_lmstate(s16);
-void gdlm_queue_delayed(struct gdlm_lock *);
 void gdlm_submit_delayed(struct gdlm_ls *);
-int gdlm_release_all_locks(struct gdlm_ls *);
-void gdlm_delete_lp(struct gdlm_lock *);
 unsigned int gdlm_do_lock(struct gdlm_lock *);
 int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b50..09d78c216f48 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
        if (!ls)
                return NULL;
-        ls->drop_locks_count = GDLM_DROP_COUNT;
-        ls->drop_locks_period = GDLM_DROP_PERIOD;
        ls->fscb = cb;
        ls->sdp = sdp;
        ls->fsflags = flags;
        spin_lock_init(&ls->async_lock);
-        INIT_LIST_HEAD(&ls->complete);
-        INIT_LIST_HEAD(&ls->blocking);
        INIT_LIST_HEAD(&ls->delayed);
        INIT_LIST_HEAD(&ls->submit);
-        INIT_LIST_HEAD(&ls->all_locks);
        init_waitqueue_head(&ls->thread_wait);
        init_waitqueue_head(&ls->wait_control);
-        ls->thread1 = NULL;
-        ls->thread2 = NULL;
-        ls->drop_time = jiffies;
        ls->jid = -1;
        strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
 static void gdlm_unmount(void *lockspace)
 {
        struct gdlm_ls *ls = lockspace;
-        int rv;
        log_debug("unmount flags %lx", ls->flags);
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
        gdlm_kobject_release(ls);
        dlm_release_lockspace(ls->dlm_lockspace, 2);
        gdlm_release_threads(ls);
-        rv = gdlm_release_all_locks(ls);
+        BUG_ON(ls->all_locks_count);
-        if (rv)
-                log_info("gdlm_unmount: %d stray locks freed", rv);
 out:
        kfree(ls);
 }
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
        dlm_release_lockspace(ls->dlm_lockspace, 2);
        gdlm_release_threads(ls);
-        gdlm_release_all_locks(ls);
        gdlm_kobject_release(ls);
 }
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9ee..4ec571c3d8a9 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
        return sprintf(buf, "%d\n", ls->recover_jid_status);
 }
-static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
-{
-        return sprintf(buf, "%d\n", ls->drop_locks_count);
-}
-static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-        ls->drop_locks_count = simple_strtol(buf, NULL, 0);
-        return len;
-}
 struct gdlm_attr {
        struct attribute attr;
        ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
 GDLM_ATTR(recover,        0644, recover_show,        recover_store);
 GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
 GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-GDLM_ATTR(drop_count,     0644, drop_count_show,     drop_count_store);
 static struct attribute *gdlm_attrs[] = {
        &gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
        &gdlm_attr_recover.attr,
        &gdlm_attr_recover_done.attr,
        &gdlm_attr_recover_status.attr,
-        &gdlm_attr_drop_count.attr,
        NULL,
 };
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28ab..38823efd698c 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
 #include "lock_dlm.h"
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+static inline int no_work(struct gdlm_ls *ls)
-   thread gets to it. */
-static void queue_submit(struct gdlm_lock *lp)
-{
-        struct gdlm_ls *ls = lp->ls;
-        spin_lock(&ls->async_lock);
-        list_add_tail(&lp->delay_list, &ls->submit);
-        spin_unlock(&ls->async_lock);
-        wake_up(&ls->thread_wait);
-}
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
-        struct gdlm_ls *ls = lp->ls;
-        unsigned int cb = 0;
-        switch (gdlm_make_lmstate(bast_mode)) {
-        case LM_ST_EXCLUSIVE:
-                cb = LM_CB_NEED_E;
-                break;
-        case LM_ST_DEFERRED:
-                cb = LM_CB_NEED_D;
-                break;
-        case LM_ST_SHARED:
-                cb = LM_CB_NEED_S;
-                break;
-        default:
-                gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
-        }
-        ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-static void wake_up_ast(struct gdlm_lock *lp)
-{
-        clear_bit(LFL_AST_WAIT, &lp->flags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-static void process_complete(struct gdlm_lock *lp)
-{
-        struct gdlm_ls *ls = lp->ls;
-        struct lm_async_cb acb;
-        s16 prev_mode = lp->cur;
-        memset(&acb, 0, sizeof(acb));
-        if (lp->lksb.sb_status == -DLM_ECANCEL) {
-                log_info("complete dlm cancel %x,%llx flags %lx",
-                         lp->lockname.ln_type,
-                         (unsigned long long)lp->lockname.ln_number,
-                         lp->flags);
-                lp->req = lp->cur;
-                acb.lc_ret |= LM_OUT_CANCELED;
-                if (lp->cur == DLM_LOCK_IV)
-                        lp->lksb.sb_lkid = 0;
-                goto out;
-        }
-        if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
-                if (lp->lksb.sb_status != -DLM_EUNLOCK) {
-                        log_info("unlock sb_status %d %x,%llx flags %lx",
-                                 lp->lksb.sb_status, lp->lockname.ln_type,
-                                 (unsigned long long)lp->lockname.ln_number,
-                                 lp->flags);
-                        return;
-                }
-                lp->cur = DLM_LOCK_IV;
-                lp->req = DLM_LOCK_IV;
-                lp->lksb.sb_lkid = 0;
-                if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
-                        gdlm_delete_lp(lp);
-                        return;
-                }
-                goto out;
-        }
-        if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
-                memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-        if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
-                if (lp->req == DLM_LOCK_PR)
-                        lp->req = DLM_LOCK_CW;
-                else if (lp->req == DLM_LOCK_CW)
-                        lp->req = DLM_LOCK_PR;
-        }
-        /*
-         * A canceled lock request.  The lock was just taken off the delayed
-         * list and was never even submitted to dlm.
-         */
-        if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
-                log_info("complete internal cancel %x,%llx",
-                         lp->lockname.ln_type,
-                         (unsigned long long)lp->lockname.ln_number);
-                lp->req = lp->cur;
-                acb.lc_ret |= LM_OUT_CANCELED;
-                goto out;
-        }
-        /*
-         * An error occured.
-         */
-        if (lp->lksb.sb_status) {
-                /* a "normal" error */
-                if ((lp->lksb.sb_status == -EAGAIN) &&
-                    (lp->lkf & DLM_LKF_NOQUEUE)) {
-                        lp->req = lp->cur;
-                        if (lp->cur == DLM_LOCK_IV)
-                                lp->lksb.sb_lkid = 0;
-                        goto out;
-                }
-                /* this could only happen with cancels I think */
-                log_info("ast sb_status %d %x,%llx flags %lx",
-                         lp->lksb.sb_status, lp->lockname.ln_type,
-                         (unsigned long long)lp->lockname.ln_number,
-                         lp->flags);
-                if (lp->lksb.sb_status == -EDEADLOCK &&
-                    lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
-                        lp->req = lp->cur;
-                        acb.lc_ret |= LM_OUT_CONV_DEADLK;
-                        if (lp->cur == DLM_LOCK_IV)
-                                lp->lksb.sb_lkid = 0;
-                        goto out;
-                } else
-                        return;
-        }
-        /*
-         * This is an AST for an EX->EX conversion for sync_lvb from GFS.
-         */
-        if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
-                wake_up_ast(lp);
-                return;
-        }
-        /*
-         * A lock has been demoted to NL because it initially completed during
-         * BLOCK_LOCKS.  Now it must be requested in the originally requested
-         * mode.
-         */
-        if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
-                gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
-                            lp->lockname.ln_type,
-                            (unsigned long long)lp->lockname.ln_number);
-                gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
-                            lp->lockname.ln_type,
-                            (unsigned long long)lp->lockname.ln_number);
-                lp->cur = DLM_LOCK_NL;
-                lp->req = lp->prev_req;
-                lp->prev_req = DLM_LOCK_IV;
-                lp->lkf &= ~DLM_LKF_CONVDEADLK;
-                set_bit(LFL_NOCACHE, &lp->flags);
-                if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-                    !test_bit(LFL_NOBLOCK, &lp->flags))
-                        gdlm_queue_delayed(lp);
-                else
-                        queue_submit(lp);
-                return;
-        }
-        /*
-         * A request is granted during dlm recovery.  It may be granted
-         * because the locks of a failed node were cleared.  In that case,
-         * there may be inconsistent data beneath this lock and we must wait
-         * for recovery to complete to use it.  When gfs recovery is done this
-         * granted lock will be converted to NL and then reacquired in this
-         * granted state.
-         */
-        if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-            !test_bit(LFL_NOBLOCK, &lp->flags) &&
-            lp->req != DLM_LOCK_NL) {
-                lp->cur = lp->req;
-                lp->prev_req = lp->req;
-                lp->req = DLM_LOCK_NL;
-                lp->lkf |= DLM_LKF_CONVERT;
-                lp->lkf &= ~DLM_LKF_CONVDEADLK;
-                log_debug("rereq %x,%llx id %x %d,%d",
-                          lp->lockname.ln_type,
-                          (unsigned long long)lp->lockname.ln_number,
-                          lp->lksb.sb_lkid, lp->cur, lp->req);
-                set_bit(LFL_REREQUEST, &lp->flags);
-                queue_submit(lp);
-                return;
-        }
-        /*
-         * DLM demoted the lock to NL before it was granted so GFS must be
-         * told it cannot cache data for this lock.
-         */
-        if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
-                set_bit(LFL_NOCACHE, &lp->flags);
-out:
-        /*
-         * This is an internal lock_dlm lock
-         */
-        if (test_bit(LFL_INLOCK, &lp->flags)) {
-                clear_bit(LFL_NOBLOCK, &lp->flags);
-                lp->cur = lp->req;
-                wake_up_ast(lp);
-                return;
-        }
-        /*
-         * Normal completion of a lock request.  Tell GFS it now has the lock.
-         */
-        clear_bit(LFL_NOBLOCK, &lp->flags);
-        lp->cur = lp->req;
-        acb.lc_name = lp->lockname;
-        acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-        if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
-            (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
-                acb.lc_ret |= LM_OUT_CACHEABLE;
-        ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-static inline int no_work(struct gdlm_ls *ls, int blocking)
 {
        int ret;
        spin_lock(&ls->async_lock);
-        ret = list_empty(&ls->complete) && list_empty(&ls->submit);
+        ret = list_empty(&ls->submit);
-        if (ret && blocking)
-                ret = list_empty(&ls->blocking);
        spin_unlock(&ls->async_lock);
        return ret;
 }
-static inline int check_drop(struct gdlm_ls *ls)
+static int gdlm_thread(void *data)
-{
-        if (!ls->drop_locks_count)
-                return 0;
-        if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
-                ls->drop_time = jiffies;
-                if (ls->all_locks_count >= ls->drop_locks_count)
-                        return 1;
-        }
-        return 0;
-}
-static int gdlm_thread(void *data, int blist)
 {
        struct gdlm_ls *ls = (struct gdlm_ls *) data;
        struct gdlm_lock *lp = NULL;
-        uint8_t complete, blocking, submit, drop;
-        /* Only thread1 is allowed to do blocking callbacks since gfs
-           may wait for a completion callback within a blocking cb. */
        while (!kthread_should_stop()) {
                wait_event_interruptible(ls->thread_wait,
-                                !no_work(ls, blist) || kthread_should_stop());
+                                !no_work(ls) || kthread_should_stop());
-                complete = blocking = submit = drop = 0;
                spin_lock(&ls->async_lock);
-                if (blist && !list_empty(&ls->blocking)) {
+                if (!list_empty(&ls->submit)) {
-                        lp = list_entry(ls->blocking.next, struct gdlm_lock,
-                                        blist);
-                        list_del_init(&lp->blist);
-                        blocking = lp->bast_mode;
-                        lp->bast_mode = 0;
-                } else if (!list_empty(&ls->complete)) {
-                        lp = list_entry(ls->complete.next, struct gdlm_lock,
-                                        clist);
-                        list_del_init(&lp->clist);
-                        complete = 1;
-                } else if (!list_empty(&ls->submit)) {
                        lp = list_entry(ls->submit.next, struct gdlm_lock,
                                        delay_list);
                        list_del_init(&lp->delay_list);
-                        submit = 1;
+                        spin_unlock(&ls->async_lock);
+                        gdlm_do_lock(lp);
+                        spin_lock(&ls->async_lock);
                }
-                drop = check_drop(ls);
                spin_unlock(&ls->async_lock);
-                if (complete)
-                        process_complete(lp);
-                else if (blocking)
-                        process_blocking(lp, blocking);
-                else if (submit)
-                        gdlm_do_lock(lp);
-                if (drop)
-                        ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
-                schedule();
        }
        return 0;
 }
-static int gdlm_thread1(void *data)
-{
-        return gdlm_thread(data, 1);
-}
-static int gdlm_thread2(void *data)
-{
-        return gdlm_thread(data, 0);
-}
 int gdlm_init_threads(struct gdlm_ls *ls)
 {
        struct task_struct *p;
        int error;
-        p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
+        p = kthread_run(gdlm_thread, ls, "lock_dlm");
-        error = IS_ERR(p);
-        if (error) {
-                log_error("can't start lock_dlm1 thread %d", error);
-                return error;
-        }
-        ls->thread1 = p;
-        p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
        error = IS_ERR(p);
        if (error) {
-                log_error("can't start lock_dlm2 thread %d", error);
+                log_error("can't start lock_dlm thread %d", error);
-                kthread_stop(ls->thread1);
                return error;
        }
-        ls->thread2 = p;
+        ls->thread = p;
        return 0;
 }
 void gdlm_release_threads(struct gdlm_ls *ls)
 {
-        kthread_stop(ls->thread1);
+        kthread_stop(ls->thread);
-        kthread_stop(ls->thread2);
 }
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a8..000000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
-lock_nolock-y := main.o
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d94..000000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/lm_interface.h>
-struct nolock_lockspace {
-        unsigned int nl_lvb_size;
-};
-static const struct lm_lockops nolock_ops;
-static int nolock_mount(char *table_name, char *host_data,
-                        lm_callback_t cb, void *cb_data,
-                        unsigned int min_lvb_size, int flags,
-                        struct lm_lockstruct *lockstruct,
-                        struct kobject *fskobj)
-{
-        char *c;
-        unsigned int jid;
-        struct nolock_lockspace *nl;
-        c = strstr(host_data, "jid=");
-        if (!c)
-                jid = 0;
-        else {
-                c += 4;
-                sscanf(c, "%u", &jid);
-        }
-        nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
-        if (!nl)
-                return -ENOMEM;
-        nl->nl_lvb_size = min_lvb_size;
-        lockstruct->ls_jid = jid;
-        lockstruct->ls_first = 1;
-        lockstruct->ls_lvb_size = min_lvb_size;
-        lockstruct->ls_lockspace = nl;
-        lockstruct->ls_ops = &nolock_ops;
-        lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-        return 0;
-}
-static void nolock_others_may_mount(void *lockspace)
-{
-}
-static void nolock_unmount(void *lockspace)
-{
-        struct nolock_lockspace *nl = lockspace;
-        kfree(nl);
-}
-static void nolock_withdraw(void *lockspace)
-{
-}
-/**
- * nolock_get_lock - get a lm_lock_t given a descripton of the lock
- * @lockspace: the lockspace the lock lives in
- * @name: the name of the lock
- * @lockp: return the lm_lock_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
-                           void **lockp)
-{
-        *lockp = lockspace;
-        return 0;
-}
-/**
- * nolock_put_lock - get rid of a lock structure
- * @lock: the lock to throw away
- *
- */
-static void nolock_put_lock(void *lock)
-{
-}
-/**
- * nolock_lock - acquire a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- * @req_state: the requested state
- * @flags: modifier flags
- *
- * Returns: A bitmap of LM_OUT_*
- */
-static unsigned int nolock_lock(void *lock, unsigned int cur_state,
-                                unsigned int req_state, unsigned int flags)
-{
-        return req_state | LM_OUT_CACHEABLE;
-}
-/**
- * nolock_unlock - unlock a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- *
- * Returns: 0
- */
-static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
-{
-        return 0;
-}
-static void nolock_cancel(void *lock)
-{
-}
-/**
- * nolock_hold_lvb - hold on to a lock value block
- * @lock: the lock the LVB is associated with
- * @lvbp: return the lm_lvb_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-static int nolock_hold_lvb(void *lock, char **lvbp)
-{
-        struct nolock_lockspace *nl = lock;
-        int error = 0;
-        *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
-        if (!*lvbp)
-                error = -ENOMEM;
-        return error;
-}
-/**
- * nolock_unhold_lvb - release a LVB
- * @lock: the lock the LVB is associated with
- * @lvb: the lock value block
- *
- */
-static void nolock_unhold_lvb(void *lock, char *lvb)
-{
-        kfree(lvb);
-}
-static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
-                            struct file *file, struct file_lock *fl)
-{
-        posix_test_lock(file, fl);
-        return 0;
-}
-static int nolock_plock(void *lockspace, struct lm_lockname *name,
-                        struct file *file, int cmd, struct file_lock *fl)
-{
-        int error;
-        error = posix_lock_file_wait(file, fl);
-        return error;
-}
-static int nolock_punlock(void *lockspace, struct lm_lockname *name,
-                          struct file *file, struct file_lock *fl)
-{
-        int error;
-        error = posix_lock_file_wait(file, fl);
-        return error;
-}
-static void nolock_recovery_done(void *lockspace, unsigned int jid,
-                                 unsigned int message)
-{
-}
-static const struct lm_lockops nolock_ops = {
-        .lm_proto_name = "lock_nolock",
-        .lm_mount = nolock_mount,
-        .lm_others_may_mount = nolock_others_may_mount,
-        .lm_unmount = nolock_unmount,
-        .lm_withdraw = nolock_withdraw,
-        .lm_get_lock = nolock_get_lock,
-        .lm_put_lock = nolock_put_lock,
-        .lm_lock = nolock_lock,
-        .lm_unlock = nolock_unlock,
-        .lm_cancel = nolock_cancel,
-        .lm_hold_lvb = nolock_hold_lvb,
-        .lm_unhold_lvb = nolock_unhold_lvb,
-        .lm_plock_get = nolock_plock_get,
-        .lm_plock = nolock_plock,
-        .lm_punlock = nolock_punlock,
-        .lm_recovery_done = nolock_recovery_done,
-        .lm_owner = THIS_MODULE,
-};
-static int __init init_nolock(void)
-{
-        int error;
-        error = gfs2_register_lockproto(&nolock_ops);
-        if (error) {
-                printk(KERN_WARNING
-                       "lock_nolock: can't register protocol: %d\n", error);
-                return error;
-        }
-        printk(KERN_INFO
-               "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
-        return 0;
-}
-static void __exit exit_nolock(void)
-{
-        gfs2_unregister_lockproto(&nolock_ops);
-}
-module_init(init_nolock);
-module_exit(exit_nolock);
-MODULE_DESCRIPTION("GFS Nolock Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836d..6c6af9f5e3ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 */
 static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+__releases(&sdp->sd_log_lock)
+__acquires(&sdp->sd_log_lock)
 {
        struct gfs2_bufdata *bd, *s;
        struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 771152816508..7c64510ccfd2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
 */
 static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
 {
        spin_lock(&sdp->sd_log_lock);
 }
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
 */
 static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
 {
        spin_unlock(&sdp->sd_log_lock);
 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd50..bcc668d0fadd 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
        INIT_HLIST_NODE(&gl->gl_list);
        spin_lock_init(&gl->gl_spin);
        INIT_LIST_HEAD(&gl->gl_holders);
-        INIT_LIST_HEAD(&gl->gl_waiters1);
-        INIT_LIST_HEAD(&gl->gl_waiters3);
        gl->gl_lvb = NULL;
        atomic_set(&gl->gl_lvb_count, 0);
        INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f82..09853620c951 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 }
 /**
- * getbuf - Get a buffer with a given address space
+ * gfs2_getbuf - Get a buffer with a given address space
 * @gl: the glock
 * @blkno: the block number (filesystem scope)
 * @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 * Returns: the buffer
 */
-static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
        struct address_space *mapping = gl->gl_aspace->i_mapping;
        struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 {
        struct buffer_head *bh;
-        bh = getbuf(gl, blkno, CREATE);
+        bh = gfs2_getbuf(gl, blkno, CREATE);
        meta_prep_new(bh);
        return bh;
 }
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
                   struct buffer_head **bhp)
 {
-        *bhp = getbuf(gl, blkno, CREATE);
+        *bhp = gfs2_getbuf(gl, blkno, CREATE);
        if (!buffer_uptodate(*bhp)) {
                ll_rw_block(READ_META, 1, bhp);
                if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
        struct buffer_head *bh;
        while (blen) {
-                bh = getbuf(ip->i_gl, bstart, NO_CREATE);
+                bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
                if (bh) {
                        lock_buffer(bh);
                        gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
        if (extlen > max_ra)
                extlen = max_ra;
-        first_bh = getbuf(gl, dblock, CREATE);
+        first_bh = gfs2_getbuf(gl, dblock, CREATE);
        if (buffer_uptodate(first_bh))
                goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
        extlen--;
        while (extlen) {
-                bh = getbuf(gl, dblock, CREATE);
+                bh = gfs2_getbuf(gl, dblock, CREATE);
                if (!buffer_uptodate(bh) && !buffer_locked(bh))
                        ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe1..b1a5f3674d43 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
                   int flags, struct buffer_head **bhp);
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
 void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
                         int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb2..e64a1b04117a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
 * @file: The file to read
 * @page: The page of the file
 *
- * This deals with the locking required. We use a trylock in order to
+ * This deals with the locking required. We have to unlock and
- * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
+ * relock the page in order to get the locking in the right
- * in the event that we are unable to get the lock.
+ * order.
 */
 static int gfs2_readpage(struct file *file, struct page *page)
 {
-        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        struct address_space *mapping = page->mapping;
-        struct gfs2_holder *gh;
+        struct gfs2_inode *ip = GFS2_I(mapping->host);
+        struct gfs2_holder gh;
        int error;
-        gh = gfs2_glock_is_locked_by_me(ip->i_gl);
+        unlock_page(page);
-        if (!gh) {
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-                gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
+        error = gfs2_glock_nq_atime(&gh);
-                if (!gh)
+        if (unlikely(error))
-                        return -ENOBUFS;
+                goto out;
-                gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
+        error = AOP_TRUNCATED_PAGE;
+        lock_page(page);
+        if (page->mapping == mapping && !PageUptodate(page))
+                error = __gfs2_readpage(file, page);
+        else
                unlock_page(page);
-                error = gfs2_glock_nq_atime(gh);
+        gfs2_glock_dq(&gh);
-                if (likely(error != 0))
-                        goto out;
-                return AOP_TRUNCATED_PAGE;
-        }
-        error = __gfs2_readpage(file, page);
-        gfs2_glock_dq(gh);
 out:
-        gfs2_holder_uninit(gh);
+        gfs2_holder_uninit(&gh);
-        kfree(gh);
+        if (error && error != AOP_TRUNCATED_PAGE)
+                lock_page(page);
        return error;
 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..e9a366d4411c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
 #include <linux/uio.h>
 #include <linux/blkdev.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
                                           &i_gh);
                if (!error) {
-                        error = remote_llseek(file, offset, origin);
+                        error = generic_file_llseek_unlocked(file, offset, origin);
                        gfs2_glock_dq_uninit(&i_gh);
                }
        } else
-                error = remote_llseek(file, offset, origin);
+                error = generic_file_llseek_unlocked(file, offset, origin);
        return error;
 }
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
        [7] = GFS2_DIF_NOATIME,
        [12] = GFS2_DIF_EXHASH,
        [14] = GFS2_DIF_INHERIT_JDATA,
-        [20] = GFS2_DIF_INHERIT_DIRECTIO,
 };
 static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
        [gfs2fl_AppendOnly] = FS_APPEND_FL,
        [gfs2fl_NoAtime] = FS_NOATIME_FL,
        [gfs2fl_ExHash] = FS_INDEX_FL,
-        [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
        [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
 };
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
                return error;
        fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
-        if (!S_ISDIR(inode->i_mode)) {
+        if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
-                if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+                fsflags |= FS_JOURNAL_DATA_FL;
-                        fsflags |= FS_JOURNAL_DATA_FL;
-                if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
-                        fsflags |= FS_DIRECTIO_FL;
-        }
        if (put_user(fsflags, ptr))
                error = -EFAULT;
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
 /* Flags that can be set by user space */
 #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|                    \
-                             GFS2_DIF_DIRECTIO|                 \
                             GFS2_DIF_IMMUTABLE|                \
                             GFS2_DIF_APPENDONLY|               \
                             GFS2_DIF_NOATIME|                  \
                             GFS2_DIF_SYNC|                     \
                             GFS2_DIF_SYSTEM|                   \
-                             GFS2_DIF_INHERIT_DIRECTIO|         \
                             GFS2_DIF_INHERIT_JDATA)
 /**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
        int error;
        u32 new_flags, flags;
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        error = mnt_want_write(filp->f_path.mnt);
        if (error)
                return error;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (error)
+                goto out_drop_write;
        flags = ip->i_di.di_flags;
        new_flags = (flags & ~mask) | (reqflags & mask);
        if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
            !capable(CAP_LINUX_IMMUTABLE))
                goto out;
        if (!IS_IMMUTABLE(inode)) {
-                error = permission(inode, MAY_WRITE, NULL);
+                error = gfs2_permission(inode, MAY_WRITE);
                if (error)
                        goto out;
        }
@@ -272,6 +269,8 @@ out_trans_end:
        gfs2_trans_end(sdp);
 out:
        gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+        mnt_drop_write(filp->f_path.mnt);
        return error;
 }
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
        if (!S_ISDIR(inode->i_mode)) {
                if (gfsflags & GFS2_DIF_INHERIT_JDATA)
                        gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
-                if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
-                        gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
                return do_gfs2_set_flags(filp, gfsflags, ~0);
        }
        return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
                        goto fail_gunlock;
                }
-                /* Listen to the Direct I/O flag */
-                if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
-                        file->f_flags |= O_DIRECT;
                gfs2_glock_dq_uninit(&i_gh);
        }
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
        int error = 0;
        state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-        flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE 
+        flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
-                | GL_FLOCK;
        mutex_lock(&fp->f_fl_mutex);
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
                gfs2_glock_dq_wait(fl_gh);
                gfs2_holder_reinit(state, flags, fl_gh);
        } else {
-                error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
+                error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
-                                      ip->i_no_addr, &gfs2_flock_glops,
+                                       &gfs2_flock_glops, CREATE, &gl);
-                                      CREATE, &gl);
                if (error)
                        goto out;
                gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d1..b4d1d6490633 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        mutex_init(&sdp->sd_rindex_mutex);
        INIT_LIST_HEAD(&sdp->sd_rindex_list);
        INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
-        INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
        INIT_LIST_HEAD(&sdp->sd_jindex_list);
        spin_lock_init(&sdp->sd_jindex_spin);
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 {
+        if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
+                return;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
                                        sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
                goto out;
        }
-        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
-            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
                                  GFS2_MIN_LVB_SIZE)) {
                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
 fail_locking:
        init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-        gfs2_gl_hash_clear(sdp, WAIT);
+        gfs2_gl_hash_clear(sdp);
        gfs2_lm_unmount(sdp);
        while (invalidate_inodes(sb))
                yield();
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c0029..1e252dfc5294 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (error)
                goto out;
-        error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_gunlock;
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        }
                }
        } else {
-                error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+                error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
                if (error)
                        goto out_gunlock;
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        /* Check out the dir to be renamed */
        if (dir_rename) {
-                error = permission(odentry->d_inode, MAY_WRITE, NULL);
+                error = gfs2_permission(odentry->d_inode, MAY_WRITE);
                if (error)
                        goto out_gunlock;
        }
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 * Returns: errno
 */
-static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int gfs2_permission(struct inode *inode, int mask)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
                unlock = 1;
        }
-        error = generic_permission(inode, mask, gfs2_check_acl);
+        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+                error = -EACCES;
+        else
+                error = generic_permission(inode, mask, gfs2_check_acl);
        if (unlock)
                gfs2_glock_dq_uninit(&i_gh);
        return error;
 }
+static int gfs2_iop_permission(struct inode *inode, int mask,
+                               struct nameidata *nd)
+{
+        return gfs2_permission(inode, mask);
+}
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 }
 const struct inode_operations gfs2_file_iops = {
-        .permission = gfs2_permission,
+        .permission = gfs2_iop_permission,
        .setattr = gfs2_setattr,
        .getattr = gfs2_getattr,
        .setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
        .rmdir = gfs2_rmdir,
        .mknod = gfs2_mknod,
        .rename = gfs2_rename,
-        .permission = gfs2_permission,
+        .permission = gfs2_iop_permission,
        .setattr = gfs2_setattr,
        .getattr = gfs2_getattr,
        .setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
 const struct inode_operations gfs2_symlink_iops = {
        .readlink = gfs2_readlink,
        .follow_link = gfs2_follow_link,
-        .permission = gfs2_permission,
+        .permission = gfs2_iop_permission,
        .setattr = gfs2_setattr,
        .getattr = gfs2_getattr,
        .setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb89..f66ea0f7a356 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
        gfs2_clear_rgrpd(sdp);
        gfs2_jindex_free(sdp);
        /*  Take apart glock structures and buffer lists  */
-        gfs2_gl_hash_clear(sdp, WAIT);
+        gfs2_gl_hash_clear(sdp);
        /*  Unmount the locking protocol  */
        gfs2_lm_unmount(sdp);
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
        sb->s_dirt = 0;
-        if (wait)
+        if (wait && sb->s_fs_info)
                gfs2_log_flush(sb->s_fs_info, NULL);
        return 0;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59a..3e073f5144fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
                do_sync = 0;
        else {
                value *= gfs2_jindex_size(sdp) * num;
-                do_div(value, den);
+                value = div_s64(value, den);
                value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
                if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
                        do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c5..d5e91f4f6a0b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
                                  unsigned int message)
 {
+        if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
+                return;
        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                sdp->sd_lockstruct.ls_ops->lm_recovery_done(
                        sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
                error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
                                           LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
-                                           GL_NOCANCEL | GL_NOCACHE, &t_gh);
+                                           GL_NOCACHE, &t_gh);
                if (error)
                        goto fail_gunlock_ji;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742b..2d90fb253505 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
        spin_lock(&sdp->sd_rindex_spin);
        sdp->sd_rindex_forward = NULL;
-        head = &sdp->sd_rindex_recent_list;
-        while (!list_empty(head)) {
-                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-                list_del(&rgd->rd_recent);
-        }
        spin_unlock(&sdp->sd_rindex_spin);
        head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 }
 /**
- * recent_rgrp_first - get first RG from "recent" list
- * @sdp: The GFS2 superblock
- * @rglast: address of the rgrp used last
- *
- * Returns: The first rgrp in the recent list
- */
-static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
-                                            u64 rglast)
-{
-        struct gfs2_rgrpd *rgd;
-        spin_lock(&sdp->sd_rindex_spin);
-        if (rglast) {
-                list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
-                        if (rgrp_contains_block(rgd, rglast))
-                                goto out;
-                }
-        }
-        rgd = NULL;
-        if (!list_empty(&sdp->sd_rindex_recent_list))
-                rgd = list_entry(sdp->sd_rindex_recent_list.next,
-                                 struct gfs2_rgrpd, rd_recent);
-out:
-        spin_unlock(&sdp->sd_rindex_spin);
-        return rgd;
-}
-/**
 * recent_rgrp_next - get next RG from "recent" list
 * @cur_rgd: current rgrp
- * @remove:
 *
 * Returns: The next rgrp in the recent list
 */
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
-                                           int remove)
 {
        struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
        struct list_head *head;
        struct gfs2_rgrpd *rgd;
        spin_lock(&sdp->sd_rindex_spin);
+        head = &sdp->sd_rindex_mru_list;
-        head = &sdp->sd_rindex_recent_list;
+        if (unlikely(cur_rgd->rd_list_mru.next == head)) {
+                spin_unlock(&sdp->sd_rindex_spin);
-        list_for_each_entry(rgd, head, rd_recent) {
+                return NULL;
-                if (rgd == cur_rgd) {
-                        if (cur_rgd->rd_recent.next != head)
-                                rgd = list_entry(cur_rgd->rd_recent.next,
-                                                 struct gfs2_rgrpd, rd_recent);
-                        else
-                                rgd = NULL;
-                        if (remove)
-                                list_del(&cur_rgd->rd_recent);
-                        goto out;
-                }
        }
+        rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
-        rgd = NULL;
-        if (!list_empty(head))
-                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-out:
        spin_unlock(&sdp->sd_rindex_spin);
        return rgd;
 }
 /**
- * recent_rgrp_add - add an RG to tail of "recent" list
- * @new_rgd: The rgrp to add
- *
- */
-static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
-{
-        struct gfs2_sbd *sdp = new_rgd->rd_sbd;
-        struct gfs2_rgrpd *rgd;
-        unsigned int count = 0;
-        unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
-        spin_lock(&sdp->sd_rindex_spin);
-        list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
-                if (rgd == new_rgd)
-                        goto out;
-                if (++count >= max)
-                        goto out;
-        }
-        list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
-out:
-        spin_unlock(&sdp->sd_rindex_spin);
-}
-/**
 * forward_rgrp_get - get an rgrp to try next from full list
 * @sdp: The GFS2 superblock
 *
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
        int loops = 0;
        int error, rg_locked;
-        /* Try recently successful rgrps */
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
-        rgd = recent_rgrp_first(sdp, ip->i_goal);
        while (rgd) {
                rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
                        if (inode)
                                return inode;
-                        rgd = recent_rgrp_next(rgd, 1);
+                        /* fall through */
-                        break;
                case GLR_TRYFAILED:
-                        rgd = recent_rgrp_next(rgd, 0);
+                        rgd = recent_rgrp_next(rgd);
                        break;
                default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 out:
        if (begin) {
-                recent_rgrp_add(rgd);
+                spin_lock(&sdp->sd_rindex_spin);
+                list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+                spin_unlock(&sdp->sd_rindex_spin);
                rgd = gfs2_rgrpd_get_next(rgd);
                if (!rgd)
                        rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f35..63a8a902d9db 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_quota_quantum = 60;
        gt->gt_atime_quantum = 3600;
        gt->gt_new_files_jdata = 0;
-        gt->gt_new_files_directio = 0;
        gt->gt_max_readahead = 1 << 18;
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
        }
        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
-                               LM_FLAG_PRIORITY | GL_NOCACHE,
+                                   GL_NOCACHE, t_gh);
-                               t_gh);
        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
                error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd0..74846559fc3f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
        return len;
 }
-static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-        if (!capable(CAP_SYS_ADMIN))
-                return -EACCES;
-        if (simple_strtol(buf, NULL, 0) != 1)
-                return -EINVAL;
-        gfs2_gl_hash_clear(sdp, NO_WAIT);
-        return len;
-}
 static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
                                size_t len)
 {
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
 GFS2_ATTR(id,                  0444, id_show,       NULL);
 GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
 GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
-GFS2_ATTR(shrink,              0200, NULL,          shrink_store);
 GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
 GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
 GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
        &gfs2_attr_id.attr,
        &gfs2_attr_fsname.attr,
        &gfs2_attr_freeze.attr,
-        &gfs2_attr_shrink.attr,
        &gfs2_attr_withdraw.attr,
        &gfs2_attr_statfs_sync.attr,
        &gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(new_files_directio, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(quota_cache_secs, 1);
 TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_quotad_secs.attr,
        &tune_attr_quota_scale.attr,
        &tune_attr_new_files_jdata.attr,
-        &tune_attr_new_files_directio.attr,
        NULL,
 };
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
        J_ASSERT(transaction->t_state == T_FINISHED);
        J_ASSERT(transaction->t_buffers == NULL);
-        J_ASSERT(transaction->t_sync_datalist == NULL);
        J_ASSERT(transaction->t_forget == NULL);
        J_ASSERT(transaction->t_iobuf_list == NULL);
        J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
+ * When an ext4 file is truncated, it is possible that some pages are not
- * not sucessfully freed, because they are attached to a committing transaction.
+ * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
 }
 /*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-        if (!jbd_trylock_bh_state(bh)) {
-                spin_unlock(&journal->j_list_lock);
-                schedule();
-                return 0;
-        }
-        return 1;
-}
-/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;
+        struct timespec now = current_kernel_time();
        if (is_journal_aborted(journal))
                return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 }
 /*
- * Wait for all submitted IO to complete.
+ * Write the filemap data using the writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
 */
-static int journal_wait_on_locked_list(journal_t *journal,
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
-                                       transaction_t *commit_transaction)
 {
-        int ret = 0;
+        int ret;
-        struct journal_head *jh;
+        struct writeback_control wbc = {
+                .sync_mode =  WB_SYNC_ALL,
-        while (commit_transaction->t_locked_list) {
+                .nr_to_write = mapping->nrpages * 2,
-                struct buffer_head *bh;
+                .range_start = 0,
+                .range_end = i_size_read(mapping->host),
-                jh = commit_transaction->t_locked_list->b_tprev;
+                .for_writepages = 1,
-                bh = jh2bh(jh);
+        };
-                get_bh(bh);
-                if (buffer_locked(bh)) {
+        ret = generic_writepages(mapping, &wbc);
-                        spin_unlock(&journal->j_list_lock);
-                        wait_on_buffer(bh);
-                        if (unlikely(!buffer_uptodate(bh)))
-                                ret = -EIO;
-                        spin_lock(&journal->j_list_lock);
-                }
-                if (!inverted_lock(journal, bh)) {
-                        put_bh(bh);
-                        spin_lock(&journal->j_list_lock);
-                        continue;
-                }
-                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        put_bh(bh);
-                } else {
-                        jbd_unlock_bh_state(bh);
-                }
-                put_bh(bh);
-                cond_resched_lock(&journal->j_list_lock);
-        }
        return ret;
-  }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+                transaction_t *commit_transaction)
 {
-        int i;
+        struct jbd2_inode *jinode;
+        int err, ret = 0;
+        struct address_space *mapping;
-        for (i = 0; i < bufs; i++) {
+        spin_lock(&journal->j_list_lock);
-                wbuf[i]->b_end_io = end_buffer_write_sync;
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-                /* We use-up our safety reference in submit_bh() */
+                mapping = jinode->i_vfs_inode->i_mapping;
-                submit_bh(WRITE, wbuf[i]);
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                /*
+                 * Submit the inode data buffers. We use writepage
+                 * instead of writepages because writepages can do
+                 * block allocation with delalloc. We need to write
+                 * only allocated blocks here.
+                 */
+                err = journal_submit_inode_data_buffers(mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                J_ASSERT(jinode->i_transaction == commit_transaction);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
+        spin_unlock(&journal->j_list_lock);
+        return ret;
 }
 /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
 */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_finish_inode_data_buffers(journal_t *journal,
-                                transaction_t *commit_transaction)
+                transaction_t *commit_transaction)
 {
-        struct journal_head *jh;
+        struct jbd2_inode *jinode, *next_i;
-        struct buffer_head *bh;
+        int err, ret = 0;
-        int locked;
-        int bufs = 0;
-        struct buffer_head **wbuf = journal->j_wbuf;
-        /*
+        /* For locking, see the comment in journal_submit_data_buffers() */
-         * Whenever we unlock the journal and sleep, things can get added
-         * onto ->t_sync_datalist, so we have to keep looping back to
-         * write_out_data until we *know* that the list is empty.
-         *
-         * Cleanup any flushed data buffers from the data list.  Even in
-         * abort mode, we want to flush this out as soon as possible.
-         */
-write_out_data:
-        cond_resched();
        spin_lock(&journal->j_list_lock);
+        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+                jinode->i_flags |= JI_COMMIT_RUNNING;
+                spin_unlock(&journal->j_list_lock);
+                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+                if (!ret)
+                        ret = err;
+                spin_lock(&journal->j_list_lock);
+                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+        }
-        while (commit_transaction->t_sync_datalist) {
+        /* Now refile inode to proper lists */
-                jh = commit_transaction->t_sync_datalist;
+        list_for_each_entry_safe(jinode, next_i,
-                bh = jh2bh(jh);
+                                 &commit_transaction->t_inode_list, i_list) {
-                locked = 0;
+                list_del(&jinode->i_list);
+                if (jinode->i_next_transaction) {
-                /* Get reference just to make sure buffer does not disappear
+                        jinode->i_transaction = jinode->i_next_transaction;
-                 * when we are forced to drop various locks */
+                        jinode->i_next_transaction = NULL;
-                get_bh(bh);
+                        list_add(&jinode->i_list,
-                /* If the buffer is dirty, we need to submit IO and hence
+                                &jinode->i_transaction->t_inode_list);
-                 * we need the buffer lock. We try to lock the buffer without
-                 * blocking. If we fail, we need to drop j_list_lock and do
-                 * blocking lock_buffer().
-                 */
-                if (buffer_dirty(bh)) {
-                        if (test_set_buffer_locked(bh)) {
-                                BUFFER_TRACE(bh, "needs blocking lock");
-                                spin_unlock(&journal->j_list_lock);
-                                /* Write out all data to prevent deadlocks */
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                lock_buffer(bh);
-                                spin_lock(&journal->j_list_lock);
-                        }
-                        locked = 1;
-                }
-                /* We have to get bh_state lock. Again out of order, sigh. */
-                if (!inverted_lock(journal, bh)) {
-                        jbd_lock_bh_state(bh);
-                        spin_lock(&journal->j_list_lock);
-                }
-                /* Someone already cleaned up the buffer? */
-                if (!buffer_jbd(bh)
-                        || jh->b_transaction != commit_transaction
-                        || jh->b_jlist != BJ_SyncData) {
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "already cleaned up");
-                        put_bh(bh);
-                        continue;
-                }
-                if (locked && test_clear_buffer_dirty(bh)) {
-                        BUFFER_TRACE(bh, "needs writeout, adding to array");
-                        wbuf[bufs++] = bh;
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        if (bufs == journal->j_wbufsize) {
-                                spin_unlock(&journal->j_list_lock);
-                                journal_do_submit_data(wbuf, bufs);
-                                bufs = 0;
-                                goto write_out_data;
-                        }
-                } else if (!locked && buffer_locked(bh)) {
-                        __jbd2_journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        put_bh(bh);
                } else {
-                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                        jinode->i_transaction = NULL;
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        if (locked)
-                                unlock_buffer(bh);
-                        jbd2_journal_remove_journal_head(bh);
-                        /* Once for our safety reference, once for
-                         * jbd2_journal_remove_journal_head() */
-                        put_bh(bh);
-                        put_bh(bh);
-                }
-                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-                        spin_unlock(&journal->j_list_lock);
-                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
-        journal_do_submit_data(wbuf, bufs);
+        return ret;
 }
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-        err = 0;
+        err = journal_submit_data_buffers(journal, commit_transaction);
-        journal_submit_data_buffers(journal, commit_transaction);
-        /*
-         * Wait for all previously submitted IO to complete if commit
-         * record is to be written synchronously.
-         */
-        spin_lock(&journal->j_list_lock);
-        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-        spin_unlock(&journal->j_list_lock);
        if (err)
                jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        jbd_debug(3, "JBD: commit phase 2\n");
        /*
-         * If we found any dirty or locked buffers, then we should have
-         * looped back up to the write_out_data label.  If there weren't
-         * any then journal_clean_data_list should have wiped the list
-         * clean by now, so check that it is in fact empty.
-         */
-        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-        jbd_debug (3, "JBD: commit phase 3\n");
-        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);
+        err = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-                spin_lock(&journal->j_list_lock);
-                err = journal_wait_on_locked_list(journal,
-                                                commit_transaction);
-                spin_unlock(&journal->j_list_lock);
-                if (err)
-                        __jbd2_journal_abort_hard(journal);
        }
+        /*
+         * This is the right place to wait for data buffers both for ASYNC
+         * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+         * the commit block went to disk (which happens above). If commit is
+         * SYNC, we need to wait for data buffers before we start writing
+         * commit block, which happens below in such setting.
+         */
+        err = journal_finish_inode_data_buffers(journal, commit_transaction);
+        if (err)
+                jbd2_journal_abort(journal, err);
        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
           so we incur less scheduling load.
        */
-        jbd_debug(3, "JBD: commit phase 4\n");
+        jbd_debug(3, "JBD: commit phase 3\n");
        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
        J_ASSERT (commit_transaction->t_shadow_list == NULL);
-        jbd_debug(3, "JBD: commit phase 5\n");
+        jbd_debug(3, "JBD: commit phase 4\n");
        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
-        jbd_debug(3, "JBD: commit phase 6\n");
+        jbd_debug(3, "JBD: commit phase 5\n");
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
           transaction can be removed from any checkpoint list it was on
           before. */
-        jbd_debug(3, "JBD: commit phase 7\n");
+        jbd_debug(3, "JBD: commit phase 6\n");
-        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
        /* Done with this transaction! */
-        jbd_debug(3, "JBD: commit phase 8\n");
+        jbd_debug(3, "JBD: commit phase 7\n");
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 }
 /*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+        /* Remember which VFS inode this jbd2 inode belongs to */
+        jinode->i_vfs_inode = inode;
+        /* Start out unattached: empty list link, no current or next transaction */
+        INIT_LIST_HEAD(&jinode->i_list);
+        jinode->i_next_transaction = NULL;
+        jinode->i_transaction = NULL;
+        /* No flag bits set initially */
+        jinode->i_flags = 0;
+}
+/*
+ * Detach @jinode from the transaction inode list it is currently on.
+ *
+ * Function to be called before we start removing the inode from memory
+ * (clear_inode() is a fine place to call it from).  If the JI_COMMIT_RUNNING
+ * flag is set on the inode, we sleep until we are woken and then re-check
+ * everything under j_list_lock before unlinking.
+ *
+ * Does nothing when @journal is NULL.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+                                    struct jbd2_inode *jinode)
+{
+        if (!journal)
+                return;
+restart:
+        spin_lock(&journal->j_list_lock);
+        /* Is commit writing out inode - we have to wait */
+        if (jinode->i_flags & JI_COMMIT_RUNNING) {
+                wait_queue_head_t *wq;
+                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+                /*
+                 * Queue ourselves on the bit waitqueue before dropping
+                 * j_list_lock, then sleep; the flag is re-tested from
+                 * scratch after wakeup.
+                 */
+                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+                spin_unlock(&journal->j_list_lock);
+                schedule();
+                finish_wait(wq, &wait.wait);
+                goto restart;
+        }
+        /* Unlink from the owning transaction's inode list, if any */
+        if (jinode->i_transaction) {
+                list_del(&jinode->i_list);
+                jinode->i_transaction = NULL;
+        }
+        spin_unlock(&journal->j_list_lock);
+}
+/*
 * debugfs tunables
 */
 #ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 *      new transaction and we can't block without protecting against other
 *      processes trying to touch the journal while it is in transition.
 *
- * Called under j_state_lock
 */
 static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
+        INIT_LIST_HEAD(&transaction->t_inode_list);
        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
 }
 /**
- * int jbd2_journal_dirty_data() -  mark a buffer as containing dirty data which
- *                             needs to be flushed before we can commit the
- *                             current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
-        journal_t *journal = handle->h_transaction->t_journal;
-        int need_brelse = 0;
-        struct journal_head *jh;
-        if (is_handle_aborted(handle))
-                return 0;
-        jh = jbd2_journal_add_journal_head(bh);
-        JBUFFER_TRACE(jh, "entry");
-        /*
-         * The buffer could *already* be dirty.  Writeout can start
-         * at any time.
-         */
-        jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-        /*
-         * What if the buffer is already part of a running transaction?
-         *
-         * There are two cases:
-         * 1) It is part of the current running transaction.  Refile it,
-         *    just in case we have allocated it as metadata, deallocated
-         *    it, then reallocated it as data.
-         * 2) It is part of the previous, still-committing transaction.
-         *    If all we want to do is to guarantee that the buffer will be
-         *    written to disk before this new transaction commits, then
-         *    being sure that the *previous* transaction has this same
-         *    property is sufficient for us!  Just leave it on its old
-         *    transaction.
-         *
-         * In case (2), the buffer must not already exist as metadata
-         * --- that would violate write ordering (a transaction is free
-         * to write its data at any point, even before the previous
-         * committing transaction has committed).  The caller must
-         * never, ever allow this to happen: there's nothing we can do
-         * about it in this layer.
-         */
-        jbd_lock_bh_state(bh);
-        spin_lock(&journal->j_list_lock);
-        /* Now that we have bh_state locked, are we really still mapped? */
-        if (!buffer_mapped(bh)) {
-                JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
-                goto no_journal;
-        }
-        if (jh->b_transaction) {
-                JBUFFER_TRACE(jh, "has transaction");
-                if (jh->b_transaction != handle->h_transaction) {
-                        JBUFFER_TRACE(jh, "belongs to older transaction");
-                        J_ASSERT_JH(jh, jh->b_transaction ==
-                                        journal->j_committing_transaction);
-                        /* @@@ IS THIS TRUE  ? */
-                        /*
-                         * Not any more.  Scenario: someone does a write()
-                         * in data=journal mode.  The buffer's transaction has
-                         * moved into commit.  Then someone does another
-                         * write() to the file.  We do the frozen data copyout
-                         * and set b_next_transaction to point to j_running_t.
-                         * And while we're in that state, someone does a
-                         * writepage() in an attempt to pageout the same area
-                         * of the file via a shared mapping.  At present that
-                         * calls jbd2_journal_dirty_data(), and we get right here.
-                         * It may be too late to journal the data.  Simply
-                         * falling through to the next test will suffice: the
-                         * data will be dirty and wil be checkpointed.  The
-                         * ordering comments in the next comment block still
-                         * apply.
-                         */
-                        //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-                        /*
-                         * If we're journalling data, and this buffer was
-                         * subject to a write(), it could be metadata, forget
-                         * or shadow against the committing transaction.  Now,
-                         * someone has dirtied the same darn page via a mapping
-                         * and it is being writepage()'d.
-                         * We *could* just steal the page from commit, with some
-                         * fancy locking there.  Instead, we just skip it -
-                         * don't tie the page's buffers to the new transaction
-                         * at all.
-                         * Implication: if we crash before the writepage() data
-                         * is written into the filesystem, recovery will replay
-                         * the write() data.
-                         */
-                        if (jh->b_jlist != BJ_None &&
-                                        jh->b_jlist != BJ_SyncData &&
-                                        jh->b_jlist != BJ_Locked) {
-                                JBUFFER_TRACE(jh, "Not stealing");
-                                goto no_journal;
-                        }
-                        /*
-                         * This buffer may be undergoing writeout in commit.  We
-                         * can't return from here and let the caller dirty it
-                         * again because that can cause the write-out loop in
-                         * commit to never terminate.
-                         */
-                        if (buffer_dirty(bh)) {
-                                get_bh(bh);
-                                spin_unlock(&journal->j_list_lock);
-                                jbd_unlock_bh_state(bh);
-                                need_brelse = 1;
-                                sync_dirty_buffer(bh);
-                                jbd_lock_bh_state(bh);
-                                spin_lock(&journal->j_list_lock);
-                                /* Since we dropped the lock... */
-                                if (!buffer_mapped(bh)) {
-                                        JBUFFER_TRACE(jh, "buffer got unmapped");
-                                        goto no_journal;
-                                }
-                                /* The buffer may become locked again at any
-                                   time if it is redirtied */
-                        }
-                        /* journal_clean_data_list() may have got there first */
-                        if (jh->b_transaction != NULL) {
-                                JBUFFER_TRACE(jh, "unfile from commit");
-                                __jbd2_journal_temp_unlink_buffer(jh);
-                                /* It still points to the committing
-                                 * transaction; move it to this one so
-                                 * that the refile assert checks are
-                                 * happy. */
-                                jh->b_transaction = handle->h_transaction;
-                        }
-                        /* The buffer will be refiled below */
-                }
-                /*
-                 * Special case --- the buffer might actually have been
-                 * allocated and then immediately deallocated in the previous,
-                 * committing transaction, so might still be left on that
-                 * transaction's metadata lists.
-                 */
-                if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
-                        JBUFFER_TRACE(jh, "not on correct data list: unfile");
-                        J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-                        __jbd2_journal_temp_unlink_buffer(jh);
-                        jh->b_transaction = handle->h_transaction;
-                        JBUFFER_TRACE(jh, "file as data");
-                        __jbd2_journal_file_buffer(jh, handle->h_transaction,
-                                                BJ_SyncData);
-                }
-        } else {
-                JBUFFER_TRACE(jh, "not on a transaction");
-                __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
-        }
-no_journal:
-        spin_unlock(&journal->j_list_lock);
-        jbd_unlock_bh_state(bh);
-        if (need_brelse) {
-                BUFFER_TRACE(bh, "brelse");
-                __brelse(bh);
-        }
-        JBUFFER_TRACE(jh, "exit");
-        jbd2_journal_put_journal_head(jh);
-        return 0;
-}
-/**
 * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
 * Remove a buffer from the appropriate transaction list.
 *
 * Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+ * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
- * is holding onto a copy of one of thee pointers, it could go bad.
+ * of these pointers, it could go bad.  Generally the caller needs to re-read
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * the pointer from the transaction_t.
 *
 * Called under j_list_lock.  The journal may not be locked.
 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
        switch (jh->b_jlist) {
        case BJ_None:
                return;
-        case BJ_SyncData:
-                list = &transaction->t_sync_datalist;
-                break;
        case BJ_Metadata:
                transaction->t_nr_buffers--;
                J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
-        case BJ_Locked:
-                list = &transaction->t_locked_list;
-                break;
        }
        __blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
                goto out;
        spin_lock(&journal->j_list_lock);
-        if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
+        if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
-                if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
-                        /* A written-back ordered data buffer */
-                        JBUFFER_TRACE(jh, "release data");
-                        __jbd2_journal_unfile_buffer(jh);
-                        jbd2_journal_remove_journal_head(bh);
-                        __brelse(bh);
-                }
-        } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
                /* written-back checkpointed metadata buffer */
                if (jh->b_jlist == BJ_None) {
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
        return;
 }
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold a
+ * reference to the buffers while inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers again.
+ *
+ * Acquires journal->j_state_lock itself, so call it without that lock held.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+        transaction_t *transaction;
+        tid_t tid;
+        spin_lock(&journal->j_state_lock);
+        transaction = journal->j_committing_transaction;
+        if (!transaction) {
+                spin_unlock(&journal->j_state_lock);
+                return;
+        }
+        tid = transaction->t_tid;
+        spin_unlock(&journal->j_state_lock);
+        jbd2_log_wait_commit(journal, tid);
+}
 /**
 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If both __GFP_WAIT and __GFP_FS are set, we wait for commit code
+ * to release the buffers.
 *
 *
 * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
 */
 int jbd2_journal_try_to_free_buffers(journal_t *journal,
-                                struct page *page, gfp_t unused_gfp_mask)
+                                struct page *page, gfp_t gfp_mask)
 {
        struct buffer_head *head;
        struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
                /*
                 * We take our own ref against the journal_head here to avoid
                 * having to add tons of locking around each instance of
-                 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+                 * jbd2_journal_remove_journal_head() and
+                 * jbd2_journal_put_journal_head().
                 */
                jh = jbd2_journal_grab_journal_head(bh);
                if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
                if (buffer_jbd(bh))
                        goto busy;
        } while ((bh = bh->b_this_page) != head);
        ret = try_to_free_buffers(page);
+        /*
+         * There are a number of places where jbd2_journal_try_to_free_buffers()
+         * could race with jbd2_journal_commit_transaction(); the latter still
+         * holds a reference to the buffers to free while processing them, so
+         * try_to_free_buffers() fails to free those buffers. Some callers of
+         * releasepage() require the page buffers to be dropped and otherwise
+         * treat failure to free as an error (e.g. generic_file_direct_IO()).
+         *
+         * So, if the caller of try_to_release_page() wants the synchronous
+         * behaviour (i.e. make sure buffers are dropped upon return),
+         * let's wait for the current transaction to finish flushing its
+         * dirty data buffers, and then try to free those buffers once
+         * more.
+         */
+        if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+                jbd2_journal_wait_for_transaction_sync_data(journal);
+                ret = try_to_free_buffers(page);
+        }
 busy:
        return ret;
 }
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
        if (!buffer_jbd(bh))
                goto zap_buffer_unlocked;
+        /* OK, we have data buffer in journaled mode */
        spin_lock(&journal->j_state_lock);
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                }
        } else if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "on committing transaction");
-                if (jh->b_jlist == BJ_Locked) {
-                        /*
-                         * The buffer is on the committing transaction's locked
-                         * list.  We have the buffer locked, so I/O has
-                         * completed.  So we can nail the buffer now.
-                         */
-                        may_free = __dispose_buffer(jh, transaction);
-                        goto zap_buffer;
-                }
                /*
                 * If it is committing, we simply cannot touch it.  We
                 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
                J_ASSERT_JH(jh, !jh->b_committed_data);
                J_ASSERT_JH(jh, !jh->b_frozen_data);
                return;
-        case BJ_SyncData:
-                list = &transaction->t_sync_datalist;
-                break;
        case BJ_Metadata:
                transaction->t_nr_buffers++;
                list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
-        case BJ_Locked:
-                list =  &transaction->t_locked_list;
-                break;
        }
        __blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
        spin_unlock(&journal->j_list_lock);
        __brelse(bh);
 }
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+        transaction_t *transaction = handle->h_transaction;
+        journal_t *journal = transaction->t_journal;
+        if (is_handle_aborted(handle))
+                return -EIO;
+        jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+                        transaction->t_tid);
+        /*
+         * First check whether the inode is already on the transaction's
+         * lists without taking the lock. Note that this check is safe
+         * without the lock as we cannot race with somebody removing inode
+         * from the transaction. The reason is that we remove inode from the
+         * transaction only in journal_release_jbd_inode() and when we commit
+         * the transaction. We are guarded from the first case by holding
+         * a reference to the inode. We are safe against the second case
+         * because if jinode->i_transaction == transaction, commit code
+         * cannot touch the transaction because we hold reference to it,
+         * and if jinode->i_next_transaction == transaction, commit code
+         * will only file the inode where we want it.
+         */
+        if (jinode->i_transaction == transaction ||
+            jinode->i_next_transaction == transaction)
+                return 0;
+        spin_lock(&journal->j_list_lock);
+        if (jinode->i_transaction == transaction ||
+            jinode->i_next_transaction == transaction)
+                goto done;
+        /* On some different transaction's list - should be
+         * the committing one */
+        if (jinode->i_transaction) {
+                J_ASSERT(jinode->i_next_transaction == NULL);
+                J_ASSERT(jinode->i_transaction ==
+                                        journal->j_committing_transaction);
+                jinode->i_next_transaction = transaction;
+                goto done;
+        }
+        /* Not on any transaction list... */
+        J_ASSERT(!jinode->i_next_transaction);
+        jinode->i_transaction = transaction;
+        list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+        spin_unlock(&journal->j_list_lock);
+        return 0;
+}
+/*
+ * This function must be called when the inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of the truncated part in
+ * case the inode is in the committing transaction, so that we uphold the
+ * ordered mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+                                        loff_t new_size)
+{
+        journal_t *journal;
+        transaction_t *commit_trans;
+        int ret = 0;
+        if (!inode->i_transaction && !inode->i_next_transaction)
+                goto out;
+        journal = inode->i_transaction->t_journal;
+        spin_lock(&journal->j_state_lock);
+        commit_trans = journal->j_committing_transaction;
+        spin_unlock(&journal->j_state_lock);
+        if (inode->i_transaction == commit_trans) {
+                ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+                        new_size, LLONG_MAX);
+                if (ret)
+                        jbd2_journal_abort(journal, ret);
+        }
+out:
+        return ret;
+}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86ee..6a73de84bcef 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -30,29 +31,19 @@
 static struct proc_dir_entry *base;
 #ifdef CONFIG_JFS_DEBUG
-static int loglevel_read(char *page, char **start, off_t off,
+static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
-                         int count, int *eof, void *data)
 {
-        int len;
+        seq_printf(m, "%d\n", jfsloglevel);
+        return 0;
-        len = sprintf(page, "%d\n", jfsloglevel);
+}
-        len -= off;
-        *start = page + off;
-        if (len > count)
-                len = count;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
+static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
+{
+        return single_open(file, jfs_loglevel_proc_show, NULL);
 }
-static int loglevel_write(struct file *file, const char __user *buffer,
+static ssize_t jfs_loglevel_proc_write(struct file *file,
-                        unsigned long count, void *data)
+                const char __user *buffer, size_t count, loff_t *ppos)
 {
        char c;
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
        jfsloglevel = c - '0';
        return count;
 }
+static const struct file_operations jfs_loglevel_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_loglevel_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+        .write          = jfs_loglevel_proc_write,
+};
 #endif
 static struct {
        const char      *name;
-        read_proc_t     *read_fn;
+        const struct file_operations *proc_fops;
-        write_proc_t    *write_fn;
 } Entries[] = {
 #ifdef CONFIG_JFS_STATISTICS
-        { "lmstats",    jfs_lmstats_read, },
+        { "lmstats",    &jfs_lmstats_proc_fops, },
-        { "txstats",    jfs_txstats_read, },
+        { "txstats",    &jfs_txstats_proc_fops, },
-        { "xtstat",     jfs_xtstat_read, },
+        { "xtstat",     &jfs_xtstat_proc_fops, },
-        { "mpstat",     jfs_mpstat_read, },
+        { "mpstat",     &jfs_mpstat_proc_fops, },
 #endif
 #ifdef CONFIG_JFS_DEBUG
-        { "TxAnchor",   jfs_txanchor_read, },
+        { "TxAnchor",   &jfs_txanchor_proc_fops, },
-        { "loglevel",   loglevel_read, loglevel_write }
+        { "loglevel",   &jfs_loglevel_proc_fops }
 #endif
 };
 #define NPROCENT        ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
                return;
        base->owner = THIS_MODULE;
-        for (i = 0; i < NPROCENT; i++) {
+        for (i = 0; i < NPROCENT; i++)
-                struct proc_dir_entry *p;
+                proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
-                if ((p = create_proc_entry(Entries[i].name, 0, base))) {
-                        p->read_proc = Entries[i].read_fn;
-                        p->write_proc = Entries[i].write_fn;
-                }
-        }
 }
 void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc0..eafd1300a00b 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
 extern int jfsloglevel;
-extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txanchor_proc_fops;
 /* information message: e.g., configuration, major event */
 #define jfs_info(fmt, arg...) do {                      \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
 *      ----------
 */
 #ifdef  CONFIG_JFS_STATISTICS
-extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_lmstats_proc_fops;
-extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txstats_proc_fops;
-extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_mpstat_proc_fops;
-extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_xtstat_proc_fops;
 #define INCREMENT(x)            ((x)++)
 #define DECREMENT(x)            ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafeb..2545bb317235 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
 #define JFS_REMOVE 3
 #define JFS_RENAME 4
-#define DIRENTSIZ(namlen) \
-    ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
 /*
 * Maximum file offset for directories.
 */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916beaf..d6363d8309d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
                                        jfs_error(ip->i_sb,
                                                  "diAlloc: can't find free bit "
                                                  "in wmap");
-                                        return EIO;
+                                        return -EIO;
                                }
                                /* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95a..cd2ec2988b59 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
 #include <linux/freezer.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
 }
 #ifdef CONFIG_JFS_STATISTICS
-int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
-                      int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS Logmgr stats\n"
                       "================\n"
                       "commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
                       lmStat.pagedone,
                       lmStat.full_page,
                       lmStat.partial_page);
+        return 0;
+}
-        begin = offset;
+static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_lmstats_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_lmstats_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_lmstats_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fcd..854ff0ec574f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
 }
 #ifdef CONFIG_JFS_STATISTICS
-int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
-                    int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS Metapage statistics\n"
                       "=======================\n"
                       "page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
                       mpStat.pagealloc,
                       mpStat.pagefree,
                       mpStat.lockwait);
+        return 0;
+}
-        begin = offset;
+static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_mpstat_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_mpstat_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_mpstat_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b2..f26e4d03ada5 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/kthread.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
 }
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
-                      int *eof, void *data)
 {
-        int len = 0;
-        off_t begin;
        char *freewait;
        char *freelockwait;
        char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
        lowlockwait =
            waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
-        len += sprintf(buffer,
+        seq_printf(m,
                       "JFS TxAnchor\n"
                       "============\n"
                       "freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
                       TxAnchor.tlocksInUse,
                       jfs_tlocks_low,
                       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+        return 0;
+}
-        begin = offset;
+static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_txanchor_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_txanchor_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_txanchor_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_txstats_proc_show(struct seq_file *m, void *v)
-                     int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS TxStats\n"
                       "===========\n"
                       "calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
                       TxStat.txBeginAnon_lockslow,
                       TxStat.txLockAlloc,
                       TxStat.txLockAlloc_freelock);
+        return 0;
+}
-        begin = offset;
+static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_txstats_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_txstats_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_txstats_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbcc..ae3acafb447b 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
 */
 #include <linux/fs.h>
+#include <linux/module.h>
 #include <linux/quotaops.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 }
 #ifdef CONFIG_JFS_STATISTICS
-int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
-                    int *eof, void *data)
 {
-        int len = 0;
+        seq_printf(m,
-        off_t begin;
-        len += sprintf(buffer,
                       "JFS Xtree statistics\n"
                       "====================\n"
                       "searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
                       xtStat.search,
                       xtStat.fastSearch,
                       xtStat.split);
+        return 0;
+}
-        begin = offset;
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
-        *start = buffer + begin;
+{
-        len -= begin;
+        return single_open(file, jfs_xtstat_proc_show, NULL);
-        if (len > length)
-                len = length;
-        else
-                *eof = 1;
-        if (len < 0)
-                len = 0;
-        return len;
 }
+const struct file_operations jfs_xtstat_proc_fops = {
+        .owner          = THIS_MODULE,
+        .open           = jfs_xtstat_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 #endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa2..2aba82386810 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
                free_UCSname(&key);
                if (rc == -ENOENT) {
                        d_add(dentry, NULL);
-                        return ERR_PTR(0);
+                        return NULL;
                } else if (rc) {
                        jfs_err("jfs_lookup: dtSearch returned %d", rc);
                        return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea65451732..0288e6d7936a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        inode = jfs_iget(sb, ROOT_I);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
-                goto out_no_root;
+                goto out_no_rw;
        }
        sb->s_root = d_alloc_root(inode);
        if (!sb->s_root)
@@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 out_no_root:
-        jfs_err("jfs_read_super: get root inode failed");
+        jfs_err("jfs_read_super: get root dentry failed");
-        if (inode)
+        iput(inode);
-                iput(inode);
 out_no_rw:
        rc = jfs_umount(sb);
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
        bio_put(bio);
 }
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
        bio->bi_end_io = mpage_end_io_read;
        if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
        submit_bio(rw, bio);
        return NULL;
 }
+EXPORT_SYMBOL(mpage_bio_submit);
 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (16-page) BIOs.
 */
-struct mpage_data {
-        struct bio *bio;
-        sector_t last_block_in_bio;
-        get_block_t *get_block;
-        unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
-                             void *data)
+                      void *data)
 {
        struct mpage_data *mpd = data;
        struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
        mpd->bio = bio;
        return ret;
 }
+EXPORT_SYMBOL(__mpage_writepage);
 /**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
        dentry->d_op = &msdos_dentry_operations;
-        lock_kernel();
+        lock_super(sb);
        res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
        if (res == -ENOENT)
                goto add;
@@ -232,7 +232,7 @@ add:
        if (dentry)
                dentry->d_op = &msdos_dentry_operations;
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!res)
                return dentry;
        return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
        unsigned char msdos_name[MSDOS_NAME];
        int err, is_hid;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
                                msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
        d_instantiate(dentry, inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
                err = fat_flush_inodes(sb, dir, inode);
        return err;
@@ -324,11 +324,12 @@ out:
 /***** Remove a directory */
 static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 {
+        struct super_block *sb = dir->i_sb;
        struct inode *inode = dentry->d_inode;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        /*
         * Check whether the directory is not in use, then check
         * whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
        inode->i_ctime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(inode->i_sb, dir, inode);
+                err = fat_flush_inodes(sb, dir, inode);
        return err;
 }
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct timespec ts;
        int err, is_hid, cluster;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
                                msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
-        unlock_kernel();
+        unlock_super(sb);
        fat_flush_inodes(sb, dir, inode);
        return 0;
 out_free:
        fat_free_clusters(dir, cluster);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -419,10 +420,11 @@ out:
 static int msdos_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = inode->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
        if (err)
                goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
        inode->i_ctime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(inode->i_sb, dir, inode);
+                err = fat_flush_inodes(sb, dir, inode);
        return err;
 }
@@ -618,10 +620,11 @@ error_inode:
 static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry)
 {
+        struct super_block *sb = old_dir->i_sb;
        unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
        int err, is_hid;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(old_dentry->d_name.name,
                                old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
        err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
                              new_dir, new_msdos_name, new_dentry, is_hid);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+                err = fat_flush_inodes(sb, old_dir, new_dir);
        return err;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
        const char *str;
 };
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 {
        static const struct proc_fs_info fs_info[] = {
                { MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
                if (sb->s_flags & fs_infop->flag)
                        seq_puts(m, fs_infop->str);
        }
+        return security_sb_show_options(m, sb);
 }
 static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
        seq_putc(m, ' ');
        show_type(m, mnt->mnt_sb);
        seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
-        show_sb_opts(m, mnt->mnt_sb);
+        err = show_sb_opts(m, mnt->mnt_sb);
+        if (err)
+                goto out;
        show_mnt_opts(m, mnt);
        if (mnt->mnt_sb->s_op->show_options)
                err = mnt->mnt_sb->s_op->show_options(m, mnt);
        seq_puts(m, " 0 0\n");
+out:
        return err;
 }
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
        seq_putc(m, ' ');
        mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
        seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
-        show_sb_opts(m, sb);
+        err = show_sb_opts(m, sb);
+        if (err)
+                goto out;
        if (sb->s_op->show_options)
                err = sb->s_op->show_options(m, mnt);
        seq_putc(m, '\n');
+out:
        return err;
 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
        return 0;
 }
+/*
+ * llseek handler installed in ncp_file_operations.
+ *
+ * Performs the seek with generic_file_llseek_unlocked() while holding the
+ * big kernel lock (lock_kernel()/unlock_kernel()) and hands back whatever
+ * that helper returned.
+ */
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+        loff_t ret;
+        lock_kernel();
+        ret = generic_file_llseek_unlocked(file, offset, origin);
+        unlock_kernel();
+        return ret;
+}
 const struct file_operations ncp_file_operations =
 {
-        .llseek         = remote_llseek,
+        .llseek         = ncp_remote_llseek,
        .read           = ncp_file_read,
        .write          = ncp_file_write,
        .ioctl          = ncp_ioctl,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 509dcb58959e..43164fe86069 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -180,6 +180,8 @@ force_reval:
 static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 {
+        loff_t loff;
        dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
                        filp->f_path.dentry->d_parent->d_name.name,
                        filp->f_path.dentry->d_name.name,
@@ -192,7 +194,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
                if (retval < 0)
                        return (loff_t)retval;
        }
-        return remote_llseek(filp, offset, origin);
+        lock_kernel();  /* BKL needed? */
+        loff = generic_file_llseek_unlocked(filp, offset, origin);
+        unlock_kernel();
+        return loff;
 }
 /*
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128a..44f87caf3683 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        res->last_used = 0;
+        spin_lock(&dlm->spinlock);
        list_add_tail(&res->tracking, &dlm->tracking_list);
+        spin_unlock(&dlm->spinlock);
        memset(res->lvb, 0, DLM_LVB_LEN);
        memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a5..80e20d9f2780 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1554,8 +1554,8 @@ out:
 */
 int ocfs2_file_lock(struct file *file, int ex, int trylock)
 {
-        int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
+        int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
-        unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+        unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
        unsigned long flags;
        struct ocfs2_file_private *fp = file->private_data;
        struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
                 * Get the lock at NLMODE to start - that way we
                 * can cancel the upconvert request if need be.
                 */
-                ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+                ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
        }
        lockres->l_action = OCFS2_AST_CONVERT;
-        lkm_flags |= LKM_CONVERT;
+        lkm_flags |= DLM_LKF_CONVERT;
        lockres->l_requested = level;
        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
@@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file)
        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
                return;
-        if (lockres->l_level == LKM_NLMODE)
+        if (lockres->l_level == DLM_LOCK_NL)
                return;
        mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file)
        lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
        lockres->l_blocking = DLM_LOCK_EX;
-        gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+        gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
+        ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
        if (ret) {
                mlog_errno(ret);
                return;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd462..bd7e0f3acfc7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
                return -ENOMEM;
        p->op_this_node = -1;
+        lock_kernel();
        mutex_lock(&ocfs2_control_lock);
        file->private_data = p;
        list_add(&p->op_list, &ocfs2_control_private_list);
        mutex_unlock(&ocfs2_control_lock);
+        unlock_kernel();
        return 0;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7ff..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
         */
        if (task->parent == current && (task->ptrace & PT_PTRACED) &&
            task_is_stopped_or_traced(task) &&
-            ptrace_may_attach(task))
+            ptrace_may_access(task, PTRACE_MODE_ATTACH))
                return 0;
        /*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
        task_lock(task);
        if (task->mm != mm)
                goto out;
-        if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+        if (task->mm != current->mm &&
+            __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
                goto out;
        task_unlock(task);
        return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
         */
        task = get_proc_task(inode);
        if (task) {
-                allowed = ptrace_may_attach(task);
+                allowed = ptrace_may_access(task, PTRACE_MODE_READ);
                put_task_struct(task);
        }
        return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
        if (!task)
                goto out_no_task;
-        if (!ptrace_may_attach(task))
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out;
        ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad466..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
        return proc_calc_metrics(page, start, off, count, eof, len);
 }
+/*
+ * Weak default for the hook that meminfo_read_proc() calls as
+ * "len += arch_report_meminfo(page + len)".  This fallback writes nothing
+ * to @page and returns 0, leaving the output length unchanged.
+ * NOTE(review): an overriding definition presumably returns the number of
+ * bytes it appended at @page -- confirm against the arch implementations.
+ */
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+        return 0;
+}
 static int meminfo_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
 {
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                len += hugetlb_report_meminfo(page + len);
+        len += arch_report_meminfo(page + len);
        return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
 };
 #endif
+/*
+ * Fallbacks for the two hooks show_stat() adds into its running "sum":
+ * arch_irq_stat_cpu(cpu) once per CPU iteration and arch_irq_stat() once
+ * after the loop.  Each is defined as 0 only when no definition is already
+ * in scope (presumably supplied by an architecture header -- TODO confirm),
+ * so by default they contribute nothing to the total.
+ */
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
 static int show_stat(struct seq_file *p, void *v)
 {
        int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
                        sum += temp;
                        per_irq_sum[j] += temp;
                }
+                sum += arch_irq_stat_cpu(i);
        }
+        sum += arch_irq_stat();
        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
                (unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c492449f3b45..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
        dev_t dev = 0;
        int len;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        if (file) {
@@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                goto out;
        ret = -EACCES;
-        if (!ptrace_may_attach(task))
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out_task;
        ret = -EINVAL;
@@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
        .mmap           = generic_file_mmap,
        .fsync          = simple_sync_file,
        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
        .llseek         = generic_file_llseek,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
        .aio_write              = generic_file_aio_write,
        .fsync                  = simple_sync_file,
        .splice_read            = generic_file_splice_read,
+        .splice_write           = generic_file_splice_write,
        .llseek                 = generic_file_llseek,
 };
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 {
        loff_t retval;
        struct inode *inode = file->f_mapping->host;
-        mutex_lock(&inode->i_mutex);
        switch (origin) {
                case SEEK_END:
                        offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
        }
        retval = -EINVAL;
        if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+                /* Special lock needed here? */
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                retval = offset;
        }
-        mutex_unlock(&inode->i_mutex);
        return retval;
 }
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
+/*
+ * Locked front end for generic_file_llseek_unlocked(): takes the i_mutex of
+ * file->f_dentry->d_inode, lets the unlocked helper do the actual seek, drops
+ * the mutex again and returns the helper's result unchanged.
+ */
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 {
-        loff_t retval;
+        loff_t n;
+        mutex_lock(&file->f_dentry->d_inode->i_mutex);
-        lock_kernel();
+        n = generic_file_llseek_unlocked(file, offset, origin);
-        switch (origin) {
+        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
-                case SEEK_END:
+        return n;
-                        offset += i_size_read(file->f_path.dentry->d_inode);
-                        break;
-                case SEEK_CUR:
-                        offset += file->f_pos;
-        }
-        retval = -EINVAL;
-        if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
-                if (offset != file->f_pos) {
-                        file->f_pos = offset;
-                        file->f_version = 0;
-                }
-                retval = offset;
-        }
-        unlock_kernel();
-        return retval;
 }
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
        return error;
 }
+/*
+ * llseek handler installed in smb_file_operations.
+ *
+ * Performs the seek with generic_file_llseek_unlocked() while holding the
+ * big kernel lock (lock_kernel()/unlock_kernel()) and hands back whatever
+ * that helper returned.
+ */
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+        loff_t ret;
+        lock_kernel();
+        ret = generic_file_llseek_unlocked(file, offset, origin);
+        unlock_kernel();
+        return ret;
+}
 const struct file_operations smb_file_operations =
 {
-        .llseek         = remote_llseek,
+        .llseek         = smb_remote_llseek,
        .read           = do_sync_read,
        .aio_read       = smb_file_aio_read,
        .write          = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b305..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                lock_page(page);
                        /*
-                         * page was truncated, stop here. if this isn't the
+                         * Page was truncated, or invalidated by the
-                         * first page, we'll just complete what we already
+                         * filesystem.  Redo the find/create, but this time the
-                         * added
+                         * page is kept locked, so there's no chance of another
+                         * race with truncate/invalidate.
                         */
                        if (!page->mapping) {
                                unlock_page(page);
-                                break;
+                                page = find_or_create_page(mapping, index,
+                                                mapping_gfp_mask(mapping));
+                                if (!page) {
+                                        error = -ENOMEM;
+                                        break;
+                                }
+                                page_cache_release(pages[page_nr]);
+                                pages[page_nr] = page;
                        }
                        /*
                         * page was already under io and is now done, great
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
        if (len == 0)
                return -ENOENT;
-        slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+        slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
        if (slots == NULL)
                return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
        struct dentry *alias;
        int err, table;
-        lock_kernel();
+        lock_super(sb);
        table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
        dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
-                unlock_kernel();
+                unlock_super(sb);
                return ERR_CAST(inode);
        }
        alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
                        dput(alias);
                else {
                        iput(inode);
-                        unlock_kernel();
+                        unlock_super(sb);
                        return alias;
                }
        }
 error:
-        unlock_kernel();
+        unlock_super(sb);
        dentry->d_op = &vfat_dentry_ops[table];
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
        struct timespec ts;
        int err;
-        lock_kernel();
+        lock_super(sb);
        ts = CURRENT_TIME_SEC;
        err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        d_instantiate(dentry, inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
 static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = fat_dir_empty(inode);
        if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
        inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -791,10 +792,11 @@ out:
 static int vfat_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
        inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct timespec ts;
        int err, cluster;
-        lock_kernel();
+        lock_super(sb);
        ts = CURRENT_TIME_SEC;
        cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        d_instantiate(dentry, inode);
-        unlock_kernel();
+        unlock_super(sb);
        return 0;
 out_free:
        fat_free_clusters(dir, cluster);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct timespec ts;
        loff_t dotdot_i_pos, new_i_pos;
        int err, is_dir, update_dotdot, corrupt = 0;
+        struct super_block *sb = old_dir->i_sb;
        old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
        old_inode = old_dentry->d_inode;
        new_inode = new_dentry->d_inode;
-        lock_kernel();
+        lock_super(sb);
        err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
        if (err)
                goto out;
@@ -951,7 +954,7 @@ out:
        brelse(sinfo.bh);
        brelse(dotdot_bh);
        brelse(old_sinfo.bh);
-        unlock_kernel();
+        unlock_super(sb);
        return err;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0ee..ad3d26ddfe31 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
        if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
                xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
-                /* If I'm the only one writing to this iclog, sync it to disk */
+                /*
-                if (atomic_read(&iclog->ic_refcnt) == 1) {
+                 * If I'm the only one writing to this iclog, sync it to disk.
+                 * We need to do an atomic compare and decrement here to avoid
+                 * racing with concurrent atomic_dec_and_lock() calls in
+                 * xlog_state_release_iclog() when there is more than one
+                 * reference to the iclog.
+                 */
+                if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+                        /* we are the only one */
                        spin_unlock(&log->l_icloglock);
-                        if ((error = xlog_state_release_iclog(log, iclog)))
+                        error = xlog_state_release_iclog(log, iclog);
+                        if (error)
                                return error;
                } else {
-                        atomic_dec(&iclog->ic_refcnt);
                        spin_unlock(&log->l_icloglock);
                }
                goto restart;