64 files changed, 1624 insertions, 1497 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c65045..e83aa5ebe861 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        return NULL;
 error:
-        if (fid)
+        p9_client_clunk(fid);
-                p9_client_clunk(fid);
        return ERR_PTR(result);
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 87ee5ccee348..ed8feb052df9 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
                                                        inode->i_ino);
        if (err) {
                inode_dec_link_count(inode);
-                iput(inode);
                mutex_unlock(&info->bfs_lock);
+                iput(inode);
                return err;
        }
        mutex_unlock(&info->bfs_lock);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe6..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
        BUG_ON(bip == NULL);
        /* A cloned bio doesn't own the integrity metadata */
-        if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+        if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+            && bip->bip_buf != NULL)
                kfree(bip->bip_buf);
        mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
@@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bdev);
+        if (bi == NULL)
+                return 0;
+        if (rw == READ && bi->verify_fn != NULL &&
+            (bi->flags & INTEGRITY_FLAG_READ))
+                return 1;
+        if (rw == WRITE && bi->generate_fn != NULL &&
+            (bi->flags & INTEGRITY_FLAG_WRITE))
+                return 1;
+        return 0;
+}
 /**
 * bio_integrity_enabled - Check whether integrity can be passed
 * @bio:        bio to check
@@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio)
        }
 }
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+        if (bi)
+                return bi->tuple_size;
+        return 0;
+}
 /**
 * bio_integrity_prep - Prepare bio for integrity I/O
 * @bio:        bio to prepare
diff --git a/fs/bio.c b/fs/bio.c
index 3cba7ae34d75..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 static struct kmem_cache *bio_slab __read_mostly;
-mempool_t *bio_split_pool __read_mostly;
+static mempool_t *bio_split_pool __read_mostly;
 /*
 * if you change this list, also change bvec_alloc or things will
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
        struct bio_vec *bvl;
        /*
-         * see comment near bvec_array define!
+         * If 'bs' is given, lookup the pool and do the mempool alloc.
+         * If not, this is a bio_kmalloc() allocation and just do a
+         * kzalloc() for the exact number of vecs right away.
         */
-        switch (nr) {
+        if (bs) {
-                case   1        : *idx = 0; break;
+                /*
-                case   2 ...   4: *idx = 1; break;
+                 * see comment near bvec_array define!
-                case   5 ...  16: *idx = 2; break;
+                 */
-                case  17 ...  64: *idx = 3; break;
+                switch (nr) {
-                case  65 ... 128: *idx = 4; break;
+                case 1:
-                case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+                        *idx = 0;
+                        break;
+                case 2 ... 4:
+                        *idx = 1;
+                        break;
+                case 5 ... 16:
+                        *idx = 2;
+                        break;
+                case 17 ... 64:
+                        *idx = 3;
+                        break;
+                case 65 ... 128:
+                        *idx = 4;
+                        break;
+                case 129 ... BIO_MAX_PAGES:
+                        *idx = 5;
+                        break;
                default:
                        return NULL;
-        }
+                }
-        /*
-         * idx now points to the pool we want to allocate from
-         */
-        bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+                /*
-        if (bvl)
+                 * idx now points to the pool we want to allocate from
-                memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+                 */
+                bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+                if (bvl)
+                        memset(bvl, 0,
+                                bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+        } else
+                bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
        return bvl;
 }
@@ -107,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
        bio_free(bio, fs_bio_set);
 }
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+        kfree(bio->bi_io_vec);
+        kfree(bio);
+}
 void bio_init(struct bio *bio)
 {
        memset(bio, 0, sizeof(*bio));
        bio->bi_flags = 1 << BIO_UPTODATE;
+        bio->bi_comp_cpu = -1;
        atomic_set(&bio->bi_cnt, 1);
 }
@@ -118,19 +146,25 @@ void bio_init(struct bio *bio)
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:   the GFP_ mask given to the slab allocator
 * @nr_iovecs:  number of iovecs to pre-allocate
- * @bs:         the bio_set to allocate from
+ * @bs:         the bio_set to allocate from. If %NULL, just use kmalloc
 *
 * Description:
- *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ *   fall back to just using @kmalloc to allocate the required memory.
 *
 *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure.
+ *   bio_set structure, or @kmalloc if none given.
 **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-        struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+        struct bio *bio;
+        if (bs)
+                bio = mempool_alloc(bs->bio_pool, gfp_mask);
+        else
+                bio = kmalloc(sizeof(*bio), gfp_mask);
        if (likely(bio)) {
                struct bio_vec *bvl = NULL;
@@ -141,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
                        bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
                        if (unlikely(!bvl)) {
-                                mempool_free(bio, bs->bio_pool);
+                                if (bs)
+                                        mempool_free(bio, bs->bio_pool);
+                                else
+                                        kfree(bio);
                                bio = NULL;
                                goto out;
                        }
@@ -164,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
        return bio;
 }
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initialization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+        struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+        if (bio)
+                bio->bi_destructor = bio_kmalloc_destructor;
+        return bio;
+}
 void zero_fill_bio(struct bio *bio)
 {
        unsigned long flags;
@@ -208,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
        return bio->bi_phys_segments;
 }
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-                blk_recount_segments(q, bio);
-        return bio->bi_hw_segments;
-}
 /**
 *      __bio_clone     -       clone a bio
 *      @bio: destination bio
@@ -350,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
         */
        while (bio->bi_phys_segments >= q->max_phys_segments
-               || bio->bi_hw_segments >= q->max_hw_segments
+               || bio->bi_phys_segments >= q->max_hw_segments) {
-               || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
                if (retried_segments)
                        return 0;
@@ -395,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
        }
        /* If we may be able to merge these biovecs, force a recount */
-        if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
+        if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
-            BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
                bio->bi_flags &= ~(1 << BIO_SEG_VALID);
        bio->bi_vcnt++;
        bio->bi_phys_segments++;
-        bio->bi_hw_segments++;
 done:
        bio->bi_size += len;
        return len;
@@ -449,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 struct bio_map_data {
        struct bio_vec *iovecs;
-        int nr_sgvecs;
        struct sg_iovec *sgvecs;
+        int nr_sgvecs;
+        int is_our_pages;
 };
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-                             struct sg_iovec *iov, int iov_count)
+                             struct sg_iovec *iov, int iov_count,
+                             int is_our_pages)
 {
        memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
        memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
        bmd->nr_sgvecs = iov_count;
+        bmd->is_our_pages = is_our_pages;
        bio->bi_private = bmd;
 }
@@ -493,7 +539,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
 }
 static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
-                          struct sg_iovec *iov, int iov_count, int uncopy)
+                          struct sg_iovec *iov, int iov_count, int uncopy,
+                          int do_free_page)
 {
        int ret = 0, i;
        struct bio_vec *bvec;
@@ -536,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
                        }
                }
-                if (uncopy)
+                if (do_free_page)
                        __free_page(bvec->bv_page);
        }
@@ -553,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 int bio_uncopy_user(struct bio *bio)
 {
        struct bio_map_data *bmd = bio->bi_private;
-        int ret;
+        int ret = 0;
-        ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1);
+        if (!bio_flagged(bio, BIO_NULL_MAPPED))
+                ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+                                     bmd->nr_sgvecs, 1, bmd->is_our_pages);
        bio_free_map_data(bmd);
        bio_put(bio);
        return ret;
@@ -565,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
 /**
 *      bio_copy_user_iov       -       copy user data to bio
 *      @q: destination block queue
+ *      @map_data: pointer to the rq_map_data holding pages (if necessary)
 *      @iov:   the iovec.
 *      @iov_count: number of elements in the iovec
 *      @write_to_vm: bool indicating writing to pages or not
+ *      @gfp_mask: memory allocation flags
 *
 *      Prepares and returns a bio for indirect user io, bouncing data
 *      to/from kernel pages as necessary. Must be paired with
 *      call bio_uncopy_user() on io completion.
 */
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
+struct bio *bio_copy_user_iov(struct request_queue *q,
-                              int iov_count, int write_to_vm)
+                              struct rq_map_data *map_data,
+                              struct sg_iovec *iov, int iov_count,
+                              int write_to_vm, gfp_t gfp_mask)
 {
        struct bio_map_data *bmd;
        struct bio_vec *bvec;
@@ -597,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
                len += iov[i].iov_len;
        }
-        bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL);
+        bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
        if (!bmd)
                return ERR_PTR(-ENOMEM);
        ret = -ENOMEM;
-        bio = bio_alloc(GFP_KERNEL, nr_pages);
+        bio = bio_alloc(gfp_mask, nr_pages);
        if (!bio)
                goto out_bmd;
        bio->bi_rw |= (!write_to_vm << BIO_RW);
        ret = 0;
+        i = 0;
        while (len) {
-                unsigned int bytes = PAGE_SIZE;
+                unsigned int bytes;
+                if (map_data)
+                        bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+                else
+                        bytes = PAGE_SIZE;
                if (bytes > len)
                        bytes = len;
-                page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+                if (map_data) {
+                        if (i == map_data->nr_entries) {
+                                ret = -ENOMEM;
+                                break;
+                        }
+                        page = map_data->pages[i++];
+                } else
+                        page = alloc_page(q->bounce_gfp | gfp_mask);
                if (!page) {
                        ret = -ENOMEM;
                        break;
@@ -634,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
         * success
         */
        if (!write_to_vm) {
-                ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0);
+                ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
                if (ret)
                        goto cleanup;
        }
-        bio_set_map_data(bmd, bio, iov, iov_count);
+        bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
        return bio;
 cleanup:
-        bio_for_each_segment(bvec, bio, i)
+        if (!map_data)
-                __free_page(bvec->bv_page);
+                bio_for_each_segment(bvec, bio, i)
+                        __free_page(bvec->bv_page);
        bio_put(bio);
 out_bmd:
@@ -654,29 +720,32 @@ out_bmd:
 /**
 *      bio_copy_user   -       copy user data to bio
 *      @q: destination block queue
+ *      @map_data: pointer to the rq_map_data holding pages (if necessary)
 *      @uaddr: start of user address
 *      @len: length in bytes
 *      @write_to_vm: bool indicating writing to pages or not
+ *      @gfp_mask: memory allocation flags
 *
 *      Prepares and returns a bio for indirect user io, bouncing data
 *      to/from kernel pages as necessary. Must be paired with
 *      call bio_uncopy_user() on io completion.
 */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
-                          unsigned int len, int write_to_vm)
+                          unsigned long uaddr, unsigned int len,
+                          int write_to_vm, gfp_t gfp_mask)
 {
        struct sg_iovec iov;
        iov.iov_base = (void __user *)uaddr;
        iov.iov_len = len;
-        return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+        return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
 static struct bio *__bio_map_user_iov(struct request_queue *q,
                                      struct block_device *bdev,
                                      struct sg_iovec *iov, int iov_count,
-                                      int write_to_vm)
+                                      int write_to_vm, gfp_t gfp_mask)
 {
        int i, j;
        int nr_pages = 0;
@@ -702,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
        if (!nr_pages)
                return ERR_PTR(-EINVAL);
-        bio = bio_alloc(GFP_KERNEL, nr_pages);
+        bio = bio_alloc(gfp_mask, nr_pages);
        if (!bio)
                return ERR_PTR(-ENOMEM);
        ret = -ENOMEM;
-        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+        pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
        if (!pages)
                goto out;
@@ -786,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 *      @uaddr: start of user address
 *      @len: length in bytes
 *      @write_to_vm: bool indicating writing to pages or not
+ *      @gfp_mask: memory allocation flags
 *
 *      Map the user space address into a bio suitable for io to a block
 *      device. Returns an error pointer in case of error.
 */
 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-                         unsigned long uaddr, unsigned int len, int write_to_vm)
+                         unsigned long uaddr, unsigned int len, int write_to_vm,
+                         gfp_t gfp_mask)
 {
        struct sg_iovec iov;
        iov.iov_base = (void __user *)uaddr;
        iov.iov_len = len;
-        return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+        return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
 /**
@@ -808,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
 *      @iov:   the iovec.
 *      @iov_count: number of elements in the iovec
 *      @write_to_vm: bool indicating writing to pages or not
+ *      @gfp_mask: memory allocation flags
 *
 *      Map the user space address into a bio suitable for io to a block
 *      device. Returns an error pointer in case of error.
 */
 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
                             struct sg_iovec *iov, int iov_count,
-                             int write_to_vm)
+                             int write_to_vm, gfp_t gfp_mask)
 {
        struct bio *bio;
-        bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
+        bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
+                                 gfp_mask);
        if (IS_ERR(bio))
                return bio;
@@ -976,48 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
                          gfp_t gfp_mask, int reading)
 {
-        unsigned long kaddr = (unsigned long)data;
-        unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        unsigned long start = kaddr >> PAGE_SHIFT;
-        const int nr_pages = end - start;
        struct bio *bio;
        struct bio_vec *bvec;
-        struct bio_map_data *bmd;
+        int i;
-        int i, ret;
-        struct sg_iovec iov;
-        iov.iov_base = data;
-        iov.iov_len = len;
-        bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask);
-        if (!bmd)
-                return ERR_PTR(-ENOMEM);
-        ret = -ENOMEM;
-        bio = bio_alloc(gfp_mask, nr_pages);
-        if (!bio)
-                goto out_bmd;
-        while (len) {
-                struct page *page;
-                unsigned int bytes = PAGE_SIZE;
-                if (bytes > len)
-                        bytes = len;
-                page = alloc_page(q->bounce_gfp | gfp_mask);
-                if (!page) {
-                        ret = -ENOMEM;
-                        goto cleanup;
-                }
-                if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
-                        ret = -EINVAL;
-                        goto cleanup;
-                }
-                len -= bytes;
+        bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
-        }
+        if (IS_ERR(bio))
+                return bio;
        if (!reading) {
                void *p = data;
@@ -1030,20 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
                }
        }
-        bio->bi_private = bmd;
        bio->bi_end_io = bio_copy_kern_endio;
-        bio_set_map_data(bmd, bio, &iov, 1);
        return bio;
-cleanup:
-        bio_for_each_segment(bvec, bio, i)
-                __free_page(bvec->bv_page);
-        bio_put(bio);
-out_bmd:
-        bio_free_map_data(bmd);
-        return ERR_PTR(ret);
 }
 /*
@@ -1230,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
 * split a bio - only worry about a bio with a single page
 * in it's iovec
 */
-struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
+struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
-        struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);
+        struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
        if (!bp)
                return bp;
@@ -1266,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
        bp->bio2.bi_end_io = bio_pair_end_2;
        bp->bio1.bi_private = bi;
-        bp->bio2.bi_private = pool;
+        bp->bio2.bi_private = bio_split_pool;
        if (bio_integrity(bi))
                bio_integrity_split(bi, bp, first_sectors);
@@ -1274,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
        return bp;
 }
+/**
+ *      bio_sector_offset - Find hardware sector offset in bio
+ *      @bio:           bio to inspect
+ *      @index:         bio_vec index
+ *      @offset:        offset in bv_page
+ *
+ *      Return the number of hardware sectors between beginning of bio
+ *      and an end point indicated by a bio_vec index and an offset
+ *      within that vector's page.
+ */
+sector_t bio_sector_offset(struct bio *bio, unsigned short index,
+                           unsigned int offset)
+{
+        unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+        struct bio_vec *bv;
+        sector_t sectors;
+        int i;
+        sectors = 0;
+        if (index >= bio->bi_idx)
+                index = bio->bi_vcnt - 1;
+        __bio_for_each_segment(bv, bio, i, 0) {
+                if (i == index) {
+                        if (offset > bv->bv_offset)
+                                sectors += (offset - bv->bv_offset) / sector_sz;
+                        break;
+                }
+                sectors += bv->bv_len / sector_sz;
+        }
+        return sectors;
+}
+EXPORT_SYMBOL(bio_sector_offset);
 /*
 * create memory pools for biovec's in a bio_set.
@@ -1376,6 +1438,7 @@ static int __init init_bio(void)
 subsys_initcall(init_bio);
 EXPORT_SYMBOL(bio_alloc);
+EXPORT_SYMBOL(bio_kmalloc);
 EXPORT_SYMBOL(bio_put);
 EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
@@ -1383,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
 EXPORT_SYMBOL(bio_clone);
 EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1393,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_copy_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_split_pool);
 EXPORT_SYMBOL(bio_copy_user);
 EXPORT_SYMBOL(bio_uncopy_user);
 EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e049..d84f0469a016 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
 *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 */
-static struct kobject *bdev_get_kobj(struct block_device *bdev)
-{
-        if (bdev->bd_contains != bdev)
-                return kobject_get(&bdev->bd_part->dev.kobj);
-        else
-                return kobject_get(&bdev->bd_disk->dev.kobj);
-}
-static struct kobject *bdev_get_holder(struct block_device *bdev)
-{
-        if (bdev->bd_contains != bdev)
-                return kobject_get(bdev->bd_part->holder_dir);
-        else
-                return kobject_get(bdev->bd_disk->holder_dir);
-}
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
        if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
        if (!bo->hdev)
                goto fail_put_sdir;
-        bo->sdev = bdev_get_kobj(bdev);
+        bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
        if (!bo->sdev)
                goto fail_put_hdev;
-        bo->hdir = bdev_get_holder(bdev);
+        bo->hdir = kobject_get(bdev->bd_part->holder_dir);
        if (!bo->hdir)
                goto fail_put_sdev;
@@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 EXPORT_SYMBOL(open_by_devnum);
+/**
+ * flush_disk - invalidates all buffer-cache entries on a disk
+ *
+ * @bdev:      struct block device to be flushed
+ *
+ * Invalidates all buffer-cache entries on a disk. It should be called
+ * when a disk has been changed -- either by a media change or online
+ * resize.
+ */
+static void flush_disk(struct block_device *bdev)
+{
+        if (__invalidate_device(bdev)) {
+                char name[BDEVNAME_SIZE] = "";
+                if (bdev->bd_disk)
+                        disk_name(bdev->bd_disk, 0, name);
+                printk(KERN_WARNING "VFS: busy inodes on changed media or "
+                       "resized disk %s\n", name);
+        }
+        if (!bdev->bd_disk)
+                return;
+        if (disk_partitionable(bdev->bd_disk))
+                bdev->bd_invalidated = 1;
+}
+/**
+ * check_disk_size_change - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @bdev: struct bdev to adjust.
+ *
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs.
+ */
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+{
+        loff_t disk_size, bdev_size;
+        disk_size = (loff_t)get_capacity(disk) << 9;
+        bdev_size = i_size_read(bdev->bd_inode);
+        if (disk_size != bdev_size) {
+                char name[BDEVNAME_SIZE];
+                disk_name(disk, 0, name);
+                printk(KERN_INFO
+                       "%s: detected capacity change from %lld to %lld\n",
+                       name, bdev_size, disk_size);
+                i_size_write(bdev->bd_inode, disk_size);
+                flush_disk(bdev);
+        }
+}
+EXPORT_SYMBOL(check_disk_size_change);
+/**
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
+ * @disk: struct gendisk to be revalidated
+ *
+ * This routine is a wrapper for lower-level driver's revalidate_disk
+ * call-backs.  It is used to do common pre and post operations needed
+ * for all revalidate_disk operations.
+ */
+int revalidate_disk(struct gendisk *disk)
+{
+        struct block_device *bdev;
+        int ret = 0;
+        if (disk->fops->revalidate_disk)
+                ret = disk->fops->revalidate_disk(disk);
+        bdev = bdget_disk(disk, 0);
+        if (!bdev)
+                return ret;
+        mutex_lock(&bdev->bd_mutex);
+        check_disk_size_change(disk, bdev);
+        mutex_unlock(&bdev->bd_mutex);
+        bdput(bdev);
+        return ret;
+}
+EXPORT_SYMBOL(revalidate_disk);
 /*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev)
        if (!bdops->media_changed(bdev->bd_disk))
                return 0;
-        if (__invalidate_device(bdev))
+        flush_disk(bdev);
-                printk("VFS: busy inodes on changed media.\n");
        if (bdops->revalidate_disk)
                bdops->revalidate_disk(bdev->bd_disk);
-        if (bdev->bd_disk->minors > 1)
-                bdev->bd_invalidated = 1;
        return 1;
 }
@@ -927,10 +988,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
 static int do_open(struct block_device *bdev, struct file *file, int for_part)
 {
-        struct module *owner = NULL;
        struct gendisk *disk;
+        struct hd_struct *part = NULL;
        int ret;
-        int part;
+        int partno;
        int perm = 0;
        if (file->f_mode & FMODE_READ)
@@ -948,25 +1009,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
        ret = -ENXIO;
        file->f_mapping = bdev->bd_inode->i_mapping;
        lock_kernel();
-        disk = get_gendisk(bdev->bd_dev, &part);
-        if (!disk) {
+        disk = get_gendisk(bdev->bd_dev, &partno);
-                unlock_kernel();
+        if (!disk)
-                bdput(bdev);
+                goto out_unlock_kernel;
-                return ret;
+        part = disk_get_part(disk, partno);
-        }
+        if (!part)
-        owner = disk->fops->owner;
+                goto out_unlock_kernel;
        mutex_lock_nested(&bdev->bd_mutex, for_part);
        if (!bdev->bd_openers) {
                bdev->bd_disk = disk;
+                bdev->bd_part = part;
                bdev->bd_contains = bdev;
-                if (!part) {
+                if (!partno) {
                        struct backing_dev_info *bdi;
                        if (disk->fops->open) {
                                ret = disk->fops->open(bdev->bd_inode, file);
                                if (ret)
-                                        goto out_first;
+                                        goto out_clear;
                        }
                        if (!bdev->bd_openers) {
                                bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -978,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
                        if (bdev->bd_invalidated)
                                rescan_partitions(disk, bdev);
                } else {
-                        struct hd_struct *p;
                        struct block_device *whole;
                        whole = bdget_disk(disk, 0);
                        ret = -ENOMEM;
                        if (!whole)
-                                goto out_first;
+                                goto out_clear;
                        BUG_ON(for_part);
                        ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
                        if (ret)
-                                goto out_first;
+                                goto out_clear;
                        bdev->bd_contains = whole;
-                        p = disk->part[part - 1];
                        bdev->bd_inode->i_data.backing_dev_info =
                           whole->bd_inode->i_data.backing_dev_info;
-                        if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
+                        if (!(disk->flags & GENHD_FL_UP) ||
+                            !part || !part->nr_sects) {
                                ret = -ENXIO;
-                                goto out_first;
+                                goto out_clear;
                        }
-                        kobject_get(&p->dev.kobj);
+                        bd_set_size(bdev, (loff_t)part->nr_sects << 9);
-                        bdev->bd_part = p;
-                        bd_set_size(bdev, (loff_t) p->nr_sects << 9);
                }
        } else {
+                disk_put_part(part);
                put_disk(disk);
-                module_put(owner);
+                module_put(disk->fops->owner);
+                part = NULL;
+                disk = NULL;
                if (bdev->bd_contains == bdev) {
                        if (bdev->bd_disk->fops->open) {
                                ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
                                if (ret)
-                                        goto out;
+                                        goto out_unlock_bdev;
                        }
                        if (bdev->bd_invalidated)
                                rescan_partitions(bdev->bd_disk, bdev);
@@ -1020,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
        unlock_kernel();
        return 0;
-out_first:
+ out_clear:
        bdev->bd_disk = NULL;
+        bdev->bd_part = NULL;
        bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, 1);
        bdev->bd_contains = NULL;
-        put_disk(disk);
+ out_unlock_bdev:
-        module_put(owner);
-out:
        mutex_unlock(&bdev->bd_mutex);
+ out_unlock_kernel:
        unlock_kernel();
-        if (ret)
-                bdput(bdev);
+        disk_put_part(part);
+        if (disk)
+                module_put(disk->fops->owner);
+        put_disk(disk);
+        bdput(bdev);
        return ret;
 }
@@ -1117,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
                put_disk(disk);
                module_put(owner);
+                disk_put_part(bdev->bd_part);
-                if (bdev->bd_contains != bdev) {
+                bdev->bd_part = NULL;
-                        kobject_put(&bdev->bd_part->dev.kobj);
-                        bdev->bd_part = NULL;
-                }
                bdev->bd_disk = NULL;
                bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
                if (bdev != bdev->bd_contains)
@@ -1197,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 /**
 * lookup_bdev  - lookup a struct block_device by name
+ * @pathname:   special file representing the block device
 *
- * @path:       special file representing the block device
+ * Get a reference to the block device at @pathname in the current
- *
- * Get a reference to the blockdevice at @path in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f9e4ad97a79e..06e521a945c3 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -9,7 +9,10 @@ files (e.g. "cp -a") to Windows servers.  For mkdir and create honor setgid bit
 on parent directory when server supports Unix Extensions but not POSIX
 create. Update cifs.upcall version to handle new Kerberos sec flags
 (this requires update of cifs.upcall program from Samba).  Fix memory leak
-on dns_upcall (resolving DFS referralls).
+on dns_upcall (resolving DFS referrals).  Fix plain text password
+authentication (requires setting SecurityFlags to 0x30030 to enable
+lanman and plain text though).  Fix writes to be at correct offset when
+file is open with O_APPEND and file is on a directio (forcedirectio) mount.
 Version 1.53
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 68b5c1169d9d..bd2343d4c6a6 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -542,10 +542,20 @@ SecurityFlags		Flags which control security negotiation and
                        hashing mechanisms (as "must use") on the other hand 
                        does not make much sense. Default flags are 
                                0x07007 
-                        (NTLM, NTLMv2 and packet signing allowed).  Maximum 
+                        (NTLM, NTLMv2 and packet signing allowed).  The maximum 
                        allowable flags if you want to allow mounts to servers
                        using weaker password hashes is 0x37037 (lanman,
-                        plaintext, ntlm, ntlmv2, signing allowed):
+                        plaintext, ntlm, ntlmv2, signing allowed).  Some
+                        SecurityFlags require the corresponding menuconfig
+                        options to be enabled (lanman and plaintext require
+                        CONFIG_CIFS_WEAK_PW_HASH for example).  Enabling
+                        plaintext authentication currently requires also
+                        enabling lanman authentication in the security flags
+                        because the cifs module only supports sending
+                        plaintext passwords using the older lanman dialect
+                        form of the session setup SMB.  (e.g. for authentication
+                        using plain text passwords, set the SecurityFlags
+                        to 0x30030):
 
                        may use packet signing                          0x00001
                        must use packet signing                         0x01001
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 83fd40dc1ef0..bd5f13d38450 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
        if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
                if (extended_security & CIFSSEC_MAY_PLNTXT) {
+                        memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
                        memcpy(lnm_session_key, password_with_pad,
                                CIFS_ENCPWD_SIZE);
                        return;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ff14d14903a0..cbefe1f1f9fe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                return -EBADF;
        open_file = (struct cifsFileInfo *) file->private_data;
+        rc = generic_write_checks(file, poffset, &write_size, 0);
+        if (rc)
+                return rc;
        xid = GetXid();
        if (*poffset > file->f_path.dentry->d_inode->i_size)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index b537fad3bf50..252fdc0567f1 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
                char lnm_session_key[CIFS_SESS_KEY_SIZE];
+                pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
                /* no capabilities flags in old lanman negotiation */
                pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
diff --git a/fs/dcache.c b/fs/dcache.c
index 80e93956aced..e7a1a99b7464 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
                if (dentry->d_parent != parent)
                        goto next;
+                /* non-existing due to RCU? */
+                if (d_unhashed(dentry))
+                        goto next;
                /*
                 * It is safe to compare names since d_move() cannot
                 * change the qstr (protected by d_lock).
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
                                goto next;
                }
-                if (!d_unhashed(dentry)) {
+                atomic_inc(&dentry->d_count);
-                        atomic_inc(&dentry->d_count);
+                found = dentry;
-                        found = dentry;
-                }
                spin_unlock(&dentry->d_lock);
                break;
 next:
diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe9..cecee501ce78 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
        task_unlock(tsk);
-        mm_update_next_owner(old_mm);
        arch_pick_mmap_layout(mm);
        if (old_mm) {
                up_read(&old_mm->mmap_sem);
                BUG_ON(active_mm != old_mm);
+                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
        }
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 struct fatent_operations {
        void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
        struct fat_entry fatent;
        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
        int i, err, nr_bhs;
+        int first_cl = cluster;
        nr_bhs = 0;
        fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
                        goto error;
                }
+                /* 
+                 * Issue discard for the sectors we no longer care about,
+                 * batching contiguous clusters into one request
+                 */
+                if (cluster != fatent.entry + 1) {
+                        int nr_clus = fatent.entry - first_cl + 1;
+                        sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+                                         nr_clus * sbi->sec_per_clus);
+                        first_cl = cluster;
+                }
                ops->ent_put(&fatent, FAT_ENT_FREE);
                if (sbi->free_clusters != -1) {
                        sbi->free_clusters++;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e546616..c962283d4e7f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
        holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
        if (time_before(now, holdtime))
                delay = holdtime - now;
+        if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+                delay = gl->gl_ops->go_min_hold_time;
        spin_lock(&gl->gl_spin);
        handle_callback(gl, state, 1, delay);
@@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
                *p++ = 'a';
        if (flags & GL_EXACT)
                *p++ = 'E';
-        if (flags & GL_ATIME)
-                *p++ = 'a';
        if (flags & GL_NOCACHE)
                *p++ = 'c';
        if (test_bit(HIF_HOLDER, &iflags))
@@ -1816,15 +1816,17 @@ restart:
        if (gl) {
                gi->gl = hlist_entry(gl->gl_list.next,
                                     struct gfs2_glock, gl_list);
-                if (gi->gl)
+        } else {
-                        gfs2_glock_hold(gi->gl);
+                gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+                                     struct gfs2_glock, gl_list);
        }
+        if (gi->gl)
+                gfs2_glock_hold(gi->gl);
        read_unlock(gl_lock_addr(gi->hash));
        if (gl)
                gfs2_glock_put(gl);
-        if (gl && gi->gl == NULL)
-                gi->hash++;
        while (gi->gl == NULL) {
+                gi->hash++;
                if (gi->hash >= GFS2_GL_HASH_SIZE)
                        return 1;
                read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
                if (gi->gl)
                        gfs2_glock_hold(gi->gl);
                read_unlock(gl_lock_addr(gi->hash));
-                gi->hash++;
        }
        if (gi->sdp != gi->gl->gl_sbd)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70fc..695c6b193611 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
 #define GL_ASYNC                0x00000040
 #define GL_EXACT                0x00000080
 #define GL_SKIP                 0x00000100
-#define GL_ATIME                0x00000200
 #define GL_NOCACHE              0x00000400
 #define GLR_TRYFAILED           13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c462..f566ec1b4e8e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,20 +386,21 @@ struct gfs2_statfs_change_host {
 #define GFS2_DATA_ORDERED       2
 struct gfs2_args {
-        char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
+        char ar_lockproto[GFS2_LOCKNAME_LEN];   /* Name of the Lock Protocol */
-        char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
+        char ar_locktable[GFS2_LOCKNAME_LEN];   /* Name of the Lock Table */
-        char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
+        char ar_hostdata[GFS2_LOCKNAME_LEN];    /* Host specific data */
-        int ar_spectator; /* Don't get a journal because we're always RO */
+        unsigned int ar_spectator:1;            /* Don't get a journal */
-        int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
+        unsigned int ar_ignore_local_fs:1;      /* Ignore optimisations */
-        int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
+        unsigned int ar_localflocks:1;          /* Let the VFS do flock|fcntl */
-        int ar_localcaching; /* Local-style caching (dangerous on multihost) */
+        unsigned int ar_localcaching:1;         /* Local caching */
-        int ar_debug; /* Oops on errors instead of trying to be graceful */
+        unsigned int ar_debug:1;                /* Oops on errors */
-        int ar_upgrade; /* Upgrade ondisk/multihost format */
+        unsigned int ar_upgrade:1;              /* Upgrade ondisk format */
-        unsigned int ar_num_glockd; /* Number of glockd threads */
+        unsigned int ar_posix_acl:1;            /* Enable posix acls */
-        int ar_posix_acl; /* Enable posix acls */
+        unsigned int ar_quota:2;                /* off/account/on */
-        int ar_quota; /* off/account/on */
+        unsigned int ar_suiddir:1;              /* suiddir support */
-        int ar_suiddir; /* suiddir support */
+        unsigned int ar_data:2;                 /* ordered/writeback */
-        int ar_data; /* ordered/writeback */
+        unsigned int ar_meta:1;                 /* mount metafs */
+        unsigned int ar_num_glockd;             /* Number of glockd threads */
 };
 struct gfs2_tune {
@@ -419,7 +420,6 @@ struct gfs2_tune {
        unsigned int gt_quota_scale_den; /* Denominator */
        unsigned int gt_quota_cache_secs;
        unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
-        unsigned int gt_atime_quantum; /* Min secs between atime updates */
        unsigned int gt_new_files_jdata;
        unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
        unsigned int gt_stall_secs; /* Detects trouble! */
@@ -432,7 +432,7 @@ enum {
        SDF_JOURNAL_CHECKED     = 0,
        SDF_JOURNAL_LIVE        = 1,
        SDF_SHUTDOWN            = 2,
-        SDF_NOATIME             = 3,
+        SDF_NOBARRIERS          = 3,
 };
 #define GFS2_FSNAME_LEN         256
@@ -461,7 +461,6 @@ struct gfs2_sb_host {
 struct gfs2_sbd {
        struct super_block *sd_vfs;
-        struct super_block *sd_vfs_meta;
        struct kobject sd_kobj;
        unsigned long sd_flags; /* SDF_... */
        struct gfs2_sb_host sd_sb;
@@ -499,7 +498,9 @@ struct gfs2_sbd {
        /* Inode Stuff */
-        struct inode *sd_master_dir;
+        struct dentry *sd_master_dir;
+        struct dentry *sd_root_dir;
        struct inode *sd_jindex;
        struct inode *sd_inum_inode;
        struct inode *sd_statfs_inode;
@@ -634,7 +635,6 @@ struct gfs2_sbd {
        /* Debugging crud */
        unsigned long sd_last_warning;
-        struct vfsmount *sd_gfs2mnt;
        struct dentry *debugfs_dir;    /* debugfs directory */
        struct dentry *debugfs_dentry_glocks; /* for debugfs */
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8b0806a32948..7cee695fa441 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
 #include <linux/security.h>
+#include <linux/time.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
        struct gfs2_dinode_host *di = &ip->i_di;
        const struct gfs2_dinode *str = buf;
+        struct timespec atime;
        u16 height, depth;
        if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
        di->di_size = be64_to_cpu(str->di_size);
        i_size_write(&ip->i_inode, di->di_size);
        gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
-        ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
+        atime.tv_sec = be64_to_cpu(str->di_atime);
-        ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+        atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+        if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+                ip->i_inode.i_atime = atime;
        ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
        ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
        ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
@@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
        if (bh)
                brelse(bh);
-        if (!inode)
-                return ERR_PTR(-ENOMEM);
        return inode;
 fail_gunlock2:
        gfs2_glock_dq_uninit(ghs + 1);
-        if (inode)
+        if (inode && !IS_ERR(inode))
                iput(inode);
 fail_gunlock:
        gfs2_glock_dq(ghs);
@@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
        return 0;
 }
-/*
- * gfs2_ok_to_move - check if it's ok to move a directory to another directory
- * @this: move this
- * @to: to here
- *
- * Follow @to back to the root and make sure we don't encounter @this
- * Assumes we already hold the rename lock.
- *
- * Returns: errno
- */
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
-{
-        struct inode *dir = &to->i_inode;
-        struct super_block *sb = dir->i_sb;
-        struct inode *tmp;
-        struct qstr dotdot;
-        int error = 0;
-        gfs2_str2qstr(&dotdot, "..");
-        igrab(dir);
-        for (;;) {
-                if (dir == &this->i_inode) {
-                        error = -EINVAL;
-                        break;
-                }
-                if (dir == sb->s_root->d_inode) {
-                        error = 0;
-                        break;
-                }
-                tmp = gfs2_lookupi(dir, &dotdot, 1);
-                if (IS_ERR(tmp)) {
-                        error = PTR_ERR(tmp);
-                        break;
-                }
-                iput(dir);
-                dir = tmp;
-        }
-        iput(dir);
-        return error;
-}
 /**
 * gfs2_readlinki - return the contents of a symlink
 * @ip: the symlink's inode
@@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
        unsigned int x;
        int error;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-        error = gfs2_glock_nq_atime(&i_gh);
+        error = gfs2_glock_nq(&i_gh);
        if (error) {
                gfs2_holder_uninit(&i_gh);
                return error;
@@ -1243,101 +1197,6 @@ out:
        return error;
 }
-/**
- * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
- *       conditionally update the inode's atime
- * @gh: the holder to acquire
- *
- * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
- * Update if the difference between the current time and the inode's current
- * atime is greater than an interval specified at mount.
- *
- * Returns: errno
- */
-int gfs2_glock_nq_atime(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_inode *ip = gl->gl_object;
-        s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
-        unsigned int state;
-        int flags;
-        int error;
-        struct timespec tv = CURRENT_TIME;
-        if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
-            gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
-            gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
-                return -EINVAL;
-        state = gh->gh_state;
-        flags = gh->gh_flags;
-        error = gfs2_glock_nq(gh);
-        if (error)
-                return error;
-        if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
-            (sdp->sd_vfs->s_flags & MS_RDONLY))
-                return 0;
-        if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
-                gfs2_glock_dq(gh);
-                gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
-                                   gh);
-                error = gfs2_glock_nq(gh);
-                if (error)
-                        return error;
-                /* Verify that atime hasn't been updated while we were
-                   trying to get exclusive lock. */
-                tv = CURRENT_TIME;
-                if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
-                        struct buffer_head *dibh;
-                        struct gfs2_dinode *di;
-                        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-                        if (error == -EROFS)
-                                return 0;
-                        if (error)
-                                goto fail;
-                        error = gfs2_meta_inode_buffer(ip, &dibh);
-                        if (error)
-                                goto fail_end_trans;
-                        ip->i_inode.i_atime = tv;
-                        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                        di = (struct gfs2_dinode *)dibh->b_data;
-                        di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
-                        di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
-                        brelse(dibh);
-                        gfs2_trans_end(sdp);
-                }
-                /* If someone else has asked for the glock,
-                   unlock and let them have it. Then reacquire
-                   in the original state. */
-                if (gfs2_glock_is_blocking(gl)) {
-                        gfs2_glock_dq(gh);
-                        gfs2_holder_reinit(state, flags, gh);
-                        return gfs2_glock_nq(gh);
-                }
-        }
-        return 0;
-fail_end_trans:
-        gfs2_trans_end(sdp);
-fail:
-        gfs2_glock_dq(gh);
-        return error;
-}
 static int
 __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 58f9607d6a86..2d43f69610a0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
                   const struct gfs2_inode *ip);
 int gfs2_permission(struct inode *inode, int mask);
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-int gfs2_glock_nq_atime(struct gfs2_holder *gh);
 int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3ab..ad305854bdc6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/bio.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_uptodate(bh);
        clear_buffer_dirty(bh);
-        unlock_buffer(bh);
        gfs2_ail1_empty(sdp, 0);
        tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
        hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
        lh->lh_hash = cpu_to_be32(hash);
-        set_buffer_dirty(bh);
+        bh->b_end_io = end_buffer_write_sync;
-        if (sync_dirty_buffer(bh))
+        if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+                goto skip_barrier;
+        get_bh(bh);
+        submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+        wait_on_buffer(bh);
+        if (buffer_eopnotsupp(bh)) {
+                clear_buffer_eopnotsupp(bh);
+                set_buffer_uptodate(bh);
+                set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+                lock_buffer(bh);
+skip_barrier:
+                get_bh(bh);
+                submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
+                wait_on_buffer(bh);
+        }
+        if (!buffer_uptodate(bh))
                gfs2_io_error_bh(sdp, bh);
        brelse(bh);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f958..df48333e6f01 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,6 +42,7 @@ enum {
        Opt_nosuiddir,
        Opt_data_writeback,
        Opt_data_ordered,
+        Opt_meta,
        Opt_err,
 };
@@ -66,6 +67,7 @@ static match_table_t tokens = {
        {Opt_nosuiddir, "nosuiddir"},
        {Opt_data_writeback, "data=writeback"},
        {Opt_data_ordered, "data=ordered"},
+        {Opt_meta, "meta"},
        {Opt_err, NULL}
 };
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
                case Opt_data_ordered:
                        args->ar_data = GFS2_DATA_ORDERED;
                        break;
+                case Opt_meta:
+                        if (remount && args->ar_meta != 1)
+                                goto cant_remount;
+                        args->ar_meta = 1;
+                        break;
                case Opt_err:
                default:
                        fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117a..27563816e1c5 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
        int error;
        unlock_page(page);
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-        error = gfs2_glock_nq_atime(&gh);
+        error = gfs2_glock_nq(&gh);
        if (unlikely(error))
                goto out;
        error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
        struct gfs2_holder gh;
        int ret;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-        ret = gfs2_glock_nq_atime(&gh);
+        ret = gfs2_glock_nq(&gh);
        if (unlikely(ret))
                goto out_uninit;
        if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        unsigned to = from + len;
        struct page *page;
-        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-        error = gfs2_glock_nq_atime(&ip->i_gh);
+        error = gfs2_glock_nq(&ip->i_gh);
        if (unlikely(error))
                goto out_uninit;
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
        if (gfs2_is_stuffed(ip))
                return 0;
-        if (offset > i_size_read(&ip->i_inode))
+        if (offset >= i_size_read(&ip->i_inode))
                return 0;
        return 1;
 }
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
         * unfortunately have the option of only flushing a range like
         * the VFS does.
         */
-        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
-        rv = gfs2_glock_nq_atime(&gh);
+        rv = gfs2_glock_nq(&gh);
        if (rv)
                return rv;
        rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411c..3a747f8e2188 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
        u64 offset = file->f_pos;
        int error;
-        gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
+        gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
-        error = gfs2_glock_nq_atime(&d_gh);
+        error = gfs2_glock_nq(&d_gh);
        if (error) {
                gfs2_holder_uninit(&d_gh);
                return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
        int error;
        u32 fsflags;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-        error = gfs2_glock_nq_atime(&gh);
+        error = gfs2_glock_nq(&gh);
        if (error)
                return error;
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        struct gfs2_alloc *al;
        int ret;
-        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-        ret = gfs2_glock_nq_atime(&gh);
+        ret = gfs2_glock_nq(&gh);
        if (ret)
                goto out;
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
        struct gfs2_holder i_gh;
        int error;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-        error = gfs2_glock_nq_atime(&i_gh);
+        error = gfs2_glock_nq(&i_gh);
        if (error) {
                gfs2_holder_uninit(&i_gh);
                return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d6490633..b117fcf2c4f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
 #define DO 0
 #define UNDO 1
+static const u32 gfs2_old_fs_formats[] = {
+        0
+};
+static const u32 gfs2_old_multihost_formats[] = {
+        0
+};
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ */
+static void gfs2_tune_init(struct gfs2_tune *gt)
+{
+        spin_lock_init(&gt->gt_spin);
+        gt->gt_demote_secs = 300;
+        gt->gt_incore_log_blocks = 1024;
+        gt->gt_log_flush_secs = 60;
+        gt->gt_recoverd_secs = 60;
+        gt->gt_logd_secs = 1;
+        gt->gt_quotad_secs = 5;
+        gt->gt_quota_simul_sync = 64;
+        gt->gt_quota_warn_period = 10;
+        gt->gt_quota_scale_num = 1;
+        gt->gt_quota_scale_den = 1;
+        gt->gt_quota_cache_secs = 300;
+        gt->gt_quota_quantum = 60;
+        gt->gt_new_files_jdata = 0;
+        gt->gt_max_readahead = 1 << 18;
+        gt->gt_stall_secs = 600;
+        gt->gt_complain_secs = 10;
+        gt->gt_statfs_quantum = 30;
+        gt->gt_statfs_slow = 0;
+}
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
        struct gfs2_sbd *sdp;
@@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        return sdp;
 }
-static void init_vfs(struct super_block *sb, unsigned noatime)
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ *
+ * Note that @silent only suppresses the "not a GFS2 filesystem" message;
+ * the format-incompatibility messages are printed unconditionally.
+ *
+ * Returns: 0 if the magic number and metadata type are correct and either
+ * both format numbers equal the current ones, or every differing format
+ * number is found in its old-format table and the ar_upgrade mount
+ * argument is set; -EINVAL otherwise.
+ */
+static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
-        struct gfs2_sbd *sdp = sb->s_fs_info;
+        unsigned int x;
-        sb->s_magic = GFS2_MAGIC;
+        if (sb->sb_magic != GFS2_MAGIC ||
-        sb->s_op = &gfs2_super_ops;
+            sb->sb_type != GFS2_METATYPE_SB) {
-        sb->s_export_op = &gfs2_export_ops;
+                if (!silent)
-        sb->s_time_gran = 1;
+                        printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
-        sb->s_maxbytes = MAX_LFS_FILESIZE;
+                return -EINVAL;
+        }
+        /*  If format numbers match exactly, we're done.  */
+        if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+            sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+                return 0;
+        if (sb->sb_fs_format != GFS2_FORMAT_FS) {
+                for (x = 0; gfs2_old_fs_formats[x]; x++) /* table is zero-terminated */
+                        if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
+                                break;
+                if (!gfs2_old_fs_formats[x]) { /* reached the terminator: format not in the table */
+                        printk(KERN_WARNING
+                               "GFS2: code version (%u, %u) is incompatible "
+                               "with ondisk format (%u, %u)\n",
+                               GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+                               sb->sb_fs_format, sb->sb_multihost_format);
+                        printk(KERN_WARNING
+                               "GFS2: I don't know how to upgrade this FS\n");
+                        return -EINVAL;
+                }
+        }
+        if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+                for (x = 0; gfs2_old_multihost_formats[x]; x++) /* table is zero-terminated */
+                        if (gfs2_old_multihost_formats[x] ==
+                            sb->sb_multihost_format)
+                                break;
+                if (!gfs2_old_multihost_formats[x]) { /* reached the terminator: format not in the table */
+                        printk(KERN_WARNING
+                               "GFS2: code version (%u, %u) is incompatible "
+                               "with ondisk format (%u, %u)\n",
+                               GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+                               sb->sb_fs_format, sb->sb_multihost_format);
+                        printk(KERN_WARNING
+                               "GFS2: I don't know how to upgrade this FS\n");
+                        return -EINVAL;
+                }
+        }
+        if (!sdp->sd_args.ar_upgrade) { /* a listed older format is still refused without the upgrade argument */
+                printk(KERN_WARNING
+                       "GFS2: code version (%u, %u) is incompatible "
+                       "with ondisk format (%u, %u)\n",
+                       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+                       sb->sb_fs_format, sb->sb_multihost_format);
+                printk(KERN_INFO
+                       "GFS2: Use the \"upgrade\" mount option to upgrade "
+                       "the FS\n");
+                printk(KERN_INFO "GFS2: See the manual for more details\n");
+                return -EINVAL;
+        }
+        return 0;
+}
+static void end_bio_io_page(struct bio *bio, int error) /* installed as bi_end_io by gfs2_read_super() */
+{
+        struct page *page = bio->bi_private; /* gfs2_read_super() stores the target page in bi_private */
-        if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
+        if (!error)
-                set_bit(noatime, &sdp->sd_flags);
+                SetPageUptodate(page); /* gfs2_read_super() tests PageUptodate() to detect failure */
+        else
+                printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+        unlock_page(page); /* gfs2_read_super() waits on the page lock; unlocked in both cases */
+}
+static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) /* decode the on-disk superblock at buf into sb */
+{
+        const struct gfs2_sb *str = buf; /* buf is interpreted as a struct gfs2_sb */
+        sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); /* numeric fields are converted from big-endian */
+        sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
+        sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
+        sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+        sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+        sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+        sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+        sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
+        sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
+        sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
+        sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
+        memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); /* copied verbatim; no terminator is added here */
+        memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); /* copied verbatim; no terminator is added here */
+}
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sdp: The GFS2 super block
+ * @sector: The location of the super block, in units of sb->s_blocksize
+ *          (it is scaled by s_blocksize >> 9 to form bi_sector)
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read only twice during each GFS2 mount and is
+ * never written to by the filesystem. The first time it is read no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
+ *
+ * On success the on-disk contents are decoded into sdp->sd_sb by
+ * gfs2_sb_in(); the temporary page is freed on every path.
+ *
+ * Returns: 0 on success, -ENOBUFS if the page or the bio cannot be
+ * allocated, or -EIO if the page is not up to date after the read
+ */
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
+{
+        struct super_block *sb = sdp->sd_vfs;
+        struct gfs2_sb *p;
+        struct page *page;
+        struct bio *bio;
+        page = alloc_page(GFP_NOFS);
+        if (unlikely(!page))
+                return -ENOBUFS;
+        ClearPageUptodate(page);
+        ClearPageDirty(page);
+        lock_page(page); /* unlocked by end_bio_io_page() when the read completes */
+        bio = bio_alloc(GFP_NOFS, 1);
+        if (unlikely(!bio)) {
+                __free_page(page);
+                return -ENOBUFS;
+        }
-        /* Don't let the VFS update atimes.  GFS2 handles this itself. */
+        bio->bi_sector = sector * (sb->s_blocksize >> 9);
-        sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+        bio->bi_bdev = sb->s_bdev;
+        bio_add_page(bio, page, PAGE_SIZE, 0);
+        bio->bi_end_io = end_bio_io_page;
+        bio->bi_private = page;
+        submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+        wait_on_page_locked(page); /* end_bio_io_page() unlocks the page */
+        bio_put(bio);
+        if (!PageUptodate(page)) { /* end_bio_io_page() sets uptodate only when error == 0 */
+                __free_page(page);
+                return -EIO;
+        }
+        p = kmap(page);
+        gfs2_sb_in(&sdp->sd_sb, p);
+        kunmap(page);
+        __free_page(page);
+        return 0;
+}
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held); not referenced
+ *      in the function body
+ * @silent: Don't print message if mount fails
+ *
+ * Reads the on-disk superblock with gfs2_read_super(), validates it with
+ * gfs2_check_sb(), then derives the block-size dependent values stored in
+ * @sdp: the fs-block to basic-block shift, pointer counts per block, the
+ * hash block geometry, the quota-change count per block, the maximum
+ * directory reservation and the sd_heightsize/sd_jheightsize tables.
+ *
+ * Returns: 0 on success, or the error returned by gfs2_read_super() or
+ * gfs2_check_sb()
+ */
+static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+        u32 hash_blocks, ind_blocks, leaf_blocks;
+        u32 tmp_blocks;
+        unsigned int x;
+        int error;
+        error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); /* uses the shift already in sdp; it is recomputed below */
+        if (error) {
+                if (!silent)
+                        fs_err(sdp, "can't read superblock\n");
+                return error;
+        }
+        error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+        if (error)
+                return error;
+        sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+                               GFS2_BASIC_BLOCK_SHIFT;
+        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+        sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+                          sizeof(struct gfs2_dinode)) / sizeof(u64);
+        sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+                          sizeof(struct gfs2_meta_header)) / sizeof(u64);
+        sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+        sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+        sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+        sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+        sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+                                sizeof(struct gfs2_meta_header)) /
+                                sizeof(struct gfs2_quota_change);
+        /* Compute maximum reservation required to add an entry to a directory */
+        hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+                             sdp->sd_jbsize);
+        ind_blocks = 0;
+        for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+                tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+                ind_blocks += tmp_blocks;
+        }
+        leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+        sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+        sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+                                sizeof(struct gfs2_dinode);
+        sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+        for (x = 2;; x++) {
+                u64 space, d;
+                u32 m;
+                space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+                d = space;
+                m = do_div(d, sdp->sd_inptrs);
+                if (d != sdp->sd_heightsize[x - 1] || m) /* dividing back does not recover the previous size: the product overflowed */
+                        break;
+                sdp->sd_heightsize[x] = space;
+        }
+        sdp->sd_max_height = x;
+        sdp->sd_heightsize[x] = ~0;
+        gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+        sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+                                 sizeof(struct gfs2_dinode);
+        sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+        for (x = 2;; x++) { /* same construction as above, starting from sd_jbsize */
+                u64 space, d;
+                u32 m;
+                space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+                d = space;
+                m = do_div(d, sdp->sd_inptrs);
+                if (d != sdp->sd_jheightsize[x - 1] || m)
+                        break;
+                sdp->sd_jheightsize[x] = space;
+        }
+        sdp->sd_max_jheight = x;
+        sdp->sd_jheightsize[x] = ~0;
+        gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+        return 0;
 }
 static int init_names(struct gfs2_sbd *sdp, int silent)
@@ -224,51 +512,59 @@ fail:
        return error;
 }
-static inline struct inode *gfs2_lookup_root(struct super_block *sb,
+static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, /* returns 0, PTR_ERR() of the lookup, or -ENOMEM */
-                                             u64 no_addr)
+                            u64 no_addr, const char *name) /* name is used only in the error messages */
 {
-        return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct dentry *dentry;
+        struct inode *inode;
+        inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+        if (IS_ERR(inode)) {
+                fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
+                return PTR_ERR(inode);
+        }
+        dentry = d_alloc_root(inode);
+        if (!dentry) {
+                fs_err(sdp, "can't alloc %s dentry\n", name);
+                iput(inode); /* no dentry was created for the inode, so release it here */
+                return -ENOMEM;
+        }
+        dentry->d_op = &gfs2_dops;
+        *dptr = dentry; /* written only on success */
+        return 0;
 }
-static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+static int init_sb(struct gfs2_sbd *sdp, int silent)
 {
        struct super_block *sb = sdp->sd_vfs;
        struct gfs2_holder sb_gh;
        u64 no_addr;
-        struct inode *inode;
+        int ret;
-        int error = 0;
-        if (undo) {
+        ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
-                if (sb->s_root) {
+                                LM_ST_SHARED, 0, &sb_gh);
-                        dput(sb->s_root);
+        if (ret) {
-                        sb->s_root = NULL;
+                fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
-                }
+                return ret;
-                return 0;
        }
-        error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+        ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
-                                 LM_ST_SHARED, 0, &sb_gh);
+        if (ret) {
-        if (error) {
+                fs_err(sdp, "can't read superblock: %d\n", ret);
-                fs_err(sdp, "can't acquire superblock glock: %d\n", error);
-                return error;
-        }
-        error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
-        if (error) {
-                fs_err(sdp, "can't read superblock: %d\n", error);
                goto out;
        }
        /* Set up the buffer cache and SB for real */
        if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
-                error = -EINVAL;
+                ret = -EINVAL;
                fs_err(sdp, "FS block size (%u) is too small for device "
                       "block size (%u)\n",
                       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
                goto out;
        }
        if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
-                error = -EINVAL;
+                ret = -EINVAL;
                fs_err(sdp, "FS block size (%u) is too big for machine "
                       "page size (%u)\n",
                       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
        /* Get the root inode */
        no_addr = sdp->sd_sb.sb_root_dir.no_addr;
-        if (sb->s_type == &gfs2meta_fs_type)
+        ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
-                no_addr = sdp->sd_sb.sb_master_dir.no_addr;
+        if (ret)
-        inode = gfs2_lookup_root(sb, no_addr);
-        if (IS_ERR(inode)) {
-                error = PTR_ERR(inode);
-                fs_err(sdp, "can't read in root inode: %d\n", error);
                goto out;
-        }
-        sb->s_root = d_alloc_root(inode);
+        /* Get the master inode */
-        if (!sb->s_root) {
+        no_addr = sdp->sd_sb.sb_master_dir.no_addr;
-                fs_err(sdp, "can't get root dentry\n");
+        ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
-                error = -ENOMEM;
+        if (ret) {
-                iput(inode);
+                dput(sdp->sd_root_dir);
-        } else
+                goto out;
-                sb->s_root->d_op = &gfs2_dops;
+        }
-        
+        sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
 out:
        gfs2_glock_dq_uninit(&sb_gh);
-        return error;
+        return ret;
 }
 /**
@@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
+        struct inode *master = sdp->sd_master_dir->d_inode;
        struct gfs2_holder ji_gh;
        struct task_struct *p;
        struct gfs2_inode *ip;
@@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
                goto fail_recoverd;
        }
-        sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+        sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
        if (IS_ERR(sdp->sd_jindex)) {
                fs_err(sdp, "can't lookup journal index: %d\n", error);
                return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 {
        int error = 0;
        struct gfs2_inode *ip;
-        struct inode *inode;
+        struct inode *master = sdp->sd_master_dir->d_inode;
        if (undo)
                goto fail_qinode;
-        inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
-        if (IS_ERR(inode)) {
-                error = PTR_ERR(inode);
-                fs_err(sdp, "can't read in master directory: %d\n", error);
-                goto fail;
-        }
-        sdp->sd_master_dir = inode;
        error = init_journal(sdp, undo);
        if (error)
-                goto fail_master;
+                goto fail;
        /* Read in the master inode number inode */
-        sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+        sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
        if (IS_ERR(sdp->sd_inum_inode)) {
                error = PTR_ERR(sdp->sd_inum_inode);
                fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
        /* Read in the master statfs inode */
-        sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+        sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
        if (IS_ERR(sdp->sd_statfs_inode)) {
                error = PTR_ERR(sdp->sd_statfs_inode);
                fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
        }
        /* Read in the resource index inode */
-        sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+        sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
        if (IS_ERR(sdp->sd_rindex)) {
                error = PTR_ERR(sdp->sd_rindex);
                fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
        sdp->sd_rindex_uptodate = 0;
        /* Read in the quota inode */
-        sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+        sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
        if (IS_ERR(sdp->sd_quota_inode)) {
                error = PTR_ERR(sdp->sd_quota_inode);
                fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +855,6 @@ fail_inum:
        iput(sdp->sd_inum_inode);
 fail_journal:
        init_journal(sdp, UNDO);
-fail_master:
-        iput(sdp->sd_master_dir);
 fail:
        return error;
 }
@@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
        char buf[30];
        int error = 0;
        struct gfs2_inode *ip;
+        struct inode *master = sdp->sd_master_dir->d_inode;
        if (sdp->sd_args.ar_spectator)
                return 0;
@@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
        if (undo)
                goto fail_qc_gh;
-        pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+        pn = gfs2_lookup_simple(master, "per_node");
        if (IS_ERR(pn)) {
                error = PTR_ERR(pn);
                fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                goto fail;
        }
-        init_vfs(sb, SDF_NOATIME);
+        sb->s_magic = GFS2_MAGIC;
+        sb->s_op = &gfs2_super_ops;
+        sb->s_export_op = &gfs2_export_ops;
+        sb->s_time_gran = 1;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
        /* Set up the buffer cache and fill in some fake block size values
           to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
        if (error)
                goto fail_lm;
-        error = init_sb(sdp, silent, DO);
+        error = init_sb(sdp, silent);
        if (error)
                goto fail_locking;
@@ -869,7 +1156,11 @@ fail_per_node:
 fail_inodes:
        init_inodes(sdp, UNDO);
 fail_sb:
-        init_sb(sdp, 0, UNDO);
+        if (sdp->sd_root_dir)
+                dput(sdp->sd_root_dir);
+        if (sdp->sd_master_dir)
+                dput(sdp->sd_master_dir);
+        sb->s_root = NULL;
 fail_locking:
        init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
@@ -887,151 +1178,63 @@ fail:
 }
 static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
-                const char *dev_name, void *data, struct vfsmount *mnt)
+                       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        struct super_block *sb;
+        return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); /* plain wrapper; sd_gfs2mnt is no longer recorded */
-        struct gfs2_sbd *sdp;
-        int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
-        if (error)
-                goto out;
-        sb = mnt->mnt_sb;
-        sdp = sb->s_fs_info;
-        sdp->sd_gfs2mnt = mnt;
-out:
-        return error;
 }
-static int fill_super_meta(struct super_block *sb, struct super_block *new,
+static struct super_block *get_gfs2_sb(const char *dev_name) /* returns the gfs2 superblock owning the path dev_name, or NULL */
-                           void *data, int silent)
 {
-        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct super_block *sb;
-        struct inode *inode;
-        int error = 0;
-        new->s_fs_info = sdp;
-        sdp->sd_vfs_meta = sb;
-        init_vfs(new, SDF_NOATIME);
-        /* Get the master inode */
-        inode = igrab(sdp->sd_master_dir);
-        new->s_root = d_alloc_root(inode);
-        if (!new->s_root) {
-                fs_err(sdp, "can't get root dentry\n");
-                error = -ENOMEM;
-                iput(inode);
-        } else
-                new->s_root->d_op = &gfs2_dops;
-        return error;
-}
-static int set_bdev_super(struct super_block *s, void *data)
-{
-        s->s_bdev = data;
-        s->s_dev = s->s_bdev->bd_dev;
-        return 0;
-}
-static int test_bdev_super(struct super_block *s, void *data)
-{
-        return s->s_bdev == data;
-}
-static struct super_block* get_gfs2_sb(const char *dev_name)
-{
-        struct kstat stat;
         struct nameidata nd;
-        struct super_block *sb = NULL, *s;
         int error;
         error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
         if (error) {
-                printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
+                printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
-                       dev_name);
+                       dev_name, error);
-                goto out;
+                return NULL; /* nothing was looked up, so there is no path to put */
-        }
-        error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
-        list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
-                if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
-                    (S_ISDIR(stat.mode) &&
-                     s == nd.path.dentry->d_inode->i_sb)) {
-                        sb = s;
-                        goto free_nd;
-                }
         }
+        sb = nd.path.dentry->d_inode->i_sb; /* superblock of the inode the path resolved to */
-        printk(KERN_WARNING "GFS2: Unrecognized block device or "
+        if (sb && (sb->s_type == &gfs2_fs_type))
-               "mount point %s\n", dev_name);
+                atomic_inc(&sb->s_active); /* the returned superblock carries an extra s_active count */
+        else
-free_nd:
+                sb = NULL; /* the path is not on a gfs2 filesystem */
         path_put(&nd.path);
-out:
         return sb;
 }
 static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
                             const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        int error = 0;
+        struct super_block *sb = NULL;
-        struct super_block *sb = NULL, *new;
         struct gfs2_sbd *sdp;
         sb = get_gfs2_sb(dev_name);
         if (!sb) {
                 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-                error = -ENOENT;
+                return -ENOENT;
-                goto error;
         }
         sdp = sb->s_fs_info;
-        if (sdp->sd_vfs_meta) {
+        mnt->mnt_sb = sb; /* reuse the existing gfs2 superblock; get_gfs2_sb() already raised s_active */
-                printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
+        mnt->mnt_root = dget(sdp->sd_master_dir); /* the meta mount is rooted at the master directory dentry */
-                error = -EBUSY;
+        return 0;
-                goto error;
-        }
-        down(&sb->s_bdev->bd_mount_sem);
-        new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
-        up(&sb->s_bdev->bd_mount_sem);
-        if (IS_ERR(new)) {
-                error = PTR_ERR(new);
-                goto error;
-        }
-        new->s_flags = flags;
-        strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
-        sb_set_blocksize(new, sb->s_blocksize);
-        error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
-        if (error) {
-                up_write(&new->s_umount);
-                deactivate_super(new);
-                goto error;
-        }
-        new->s_flags |= MS_ACTIVE;
-        /* Grab a reference to the gfs2 mount point */
-        atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
-        return simple_set_mnt(mnt, new);
-error:
-        return error;
 }
 static void gfs2_kill_sb(struct super_block *sb)
 {
-        if (sb->s_fs_info) {
+        struct gfs2_sbd *sdp = sb->s_fs_info; /* tested for NULL before each use below */
-                gfs2_delete_debugfs_file(sb->s_fs_info);
+        if (sdp) {
-                gfs2_meta_syncfs(sb->s_fs_info);
+                gfs2_meta_syncfs(sdp);
+                dput(sdp->sd_root_dir); /* drop the root and master dentries stored by init_sb() */
+                dput(sdp->sd_master_dir);
+                sdp->sd_root_dir = NULL;
+                sdp->sd_master_dir = NULL;
         }
+        shrink_dcache_sb(sb);
         kill_block_super(sb);
-}
+        if (sdp)
+                gfs2_delete_debugfs_file(sdp); /* NOTE(review): sdp is used after kill_block_super(); confirm it is still valid here */
-static void gfs2_kill_sb_meta(struct super_block *sb)
-{
-        struct gfs2_sbd *sdp = sb->s_fs_info;
-        generic_shutdown_super(sb);
-        sdp->sd_vfs_meta = NULL;
-        atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
 }
 struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
        .name = "gfs2meta",
        .fs_flags = FS_REQUIRES_DEV,
        .get_sb = gfs2_get_sb_meta,
-        .kill_sb = gfs2_kill_sb_meta,
        .owner = THIS_MODULE,
 };
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e2c62f73a778..534e1e2c65ca 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-        error = gfs2_glock_nq_m(2, ghs);
+        error = gfs2_glock_nq(ghs); /* parent */
        if (error)
-                goto out;
+                goto out_parent;
+        error = gfs2_glock_nq(ghs + 1); /* child */
+        if (error)
+                goto out_child;
        error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
@@ -245,8 +249,10 @@ out_alloc:
        if (alloc_required)
                gfs2_alloc_put(dip);
 out_gunlock:
-        gfs2_glock_dq_m(2, ghs);
+        gfs2_glock_dq(ghs + 1);
-out:
+out_child:
+        gfs2_glock_dq(ghs);
+out_parent:
        gfs2_holder_uninit(ghs);
        gfs2_holder_uninit(ghs + 1);
        if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
        error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
        if (error)
-                goto out_rgrp;
+                goto out_gunlock;
        error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
        if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 out_end_trans:
        gfs2_trans_end(sdp);
+out_gunlock:
        gfs2_glock_dq(ghs + 2);
 out_rgrp:
        gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
        struct gfs2_holder ri_gh;
        int error;
        error = gfs2_rindex_hold(sdp, &ri_gh);
        if (error)
                return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
        rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
        gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
-        error = gfs2_glock_nq_m(3, ghs);
+        error = gfs2_glock_nq(ghs); /* parent */
        if (error)
-                goto out;
+                goto out_parent;
+        error = gfs2_glock_nq(ghs + 1); /* child */
+        if (error)
+                goto out_child;
+        error = gfs2_glock_nq(ghs + 2); /* rgrp */
+        if (error)
+                goto out_rgrp;
        error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
        if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
        gfs2_trans_end(sdp);
 out_gunlock:
-        gfs2_glock_dq_m(3, ghs);
+        gfs2_glock_dq(ghs + 2);
-out:
+out_rgrp:
-        gfs2_holder_uninit(ghs);
-        gfs2_holder_uninit(ghs + 1);
        gfs2_holder_uninit(ghs + 2);
+        gfs2_glock_dq(ghs + 1);
+out_child:
+        gfs2_holder_uninit(ghs + 1);
+        gfs2_glock_dq(ghs);
+out_parent:
+        gfs2_holder_uninit(ghs);
        gfs2_glock_dq_uninit(&ri_gh);
        return error;
 }
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
        return 0;
 }
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root (sb->s_root) via ".." lookups and make sure
+ * we don't encounter @this.
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: 0 if the root is reached without meeting @this, -EINVAL if
+ * @this is @to itself or is met on the way up, or the error returned by
+ * the ".." lookup
+ */
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+        struct inode *dir = &to->i_inode;
+        struct super_block *sb = dir->i_sb;
+        struct inode *tmp;
+        struct qstr dotdot;
+        int error = 0;
+        gfs2_str2qstr(&dotdot, "..");
+        igrab(dir); /* paired with an iput(): in the loop when stepping up, or after the loop */
+        for (;;) {
+                if (dir == &this->i_inode) {
+                        error = -EINVAL;
+                        break;
+                }
+                if (dir == sb->s_root->d_inode) {
+                        error = 0;
+                        break;
+                }
+                tmp = gfs2_lookupi(dir, &dotdot, 1);
+                if (IS_ERR(tmp)) {
+                        error = PTR_ERR(tmp);
+                        break;
+                }
+                iput(dir); /* release the current directory before stepping to its parent */
+                dir = tmp;
+        }
+        iput(dir); /* release whichever directory the walk stopped on */
+        return error;
+}
 /**
 * gfs2_rename - Rename a file
 * @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
        struct gfs2_inode *nip = NULL;
        struct gfs2_sbd *sdp = GFS2_SB(odir);
-        struct gfs2_holder ghs[5], r_gh;
+        struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
        struct gfs2_rgrpd *nrgd;
        unsigned int num_gh;
        int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        return 0;
        }
-        /* Make sure we aren't trying to move a dirctory into it's subdir */
-        if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
-                dir_rename = 1;
-                error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0,
+        if (odip != ndip) {
-                                           &r_gh);
+                error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+                                           0, &r_gh);
                if (error)
                        goto out;
-                error = gfs2_ok_to_move(ip, ndip);
+                if (S_ISDIR(ip->i_inode.i_mode)) {
-                if (error)
+                        dir_rename = 1;
-                        goto out_gunlock_r;
+                        /* don't move a directory into its subdir */
+                        error = gfs2_ok_to_move(ip, ndip);
+                        if (error)
+                                goto out_gunlock_r;
+                }
        }
        num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
        }
-        error = gfs2_glock_nq_m(num_gh, ghs);
+        for (x = 0; x < num_gh; x++) {
-        if (error)
+                error = gfs2_glock_nq(ghs + x);
-                goto out_uninit;
+                if (error)
+                        goto out_gunlock;
+        }
        /* Check out the old directory */
@@ -804,12 +873,12 @@ out_alloc:
        if (alloc_required)
                gfs2_alloc_put(ndip);
 out_gunlock:
-        gfs2_glock_dq_m(num_gh, ghs);
+        while (x--) {
-out_uninit:
+                gfs2_glock_dq(ghs + x);
-        for (x = 0; x < num_gh; x++)
                gfs2_holder_uninit(ghs + x);
+        }
 out_gunlock_r:
-        if (dir_rename)
+        if (r_gh.gh_gl)
                gfs2_glock_dq_uninit(&r_gh);
 out:
        return error;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a356..d5355d9b5926 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/time.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -38,6 +39,7 @@
 #include "dir.h"
 #include "eattr.h"
 #include "bmap.h"
+#include "meta_io.h"
 /**
 * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
 static int gfs2_write_inode(struct inode *inode, int sync)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        /* Check this is a "normal" inode */
+        struct gfs2_holder gh;
-        if (test_bit(GIF_USER, &ip->i_flags)) {
+        struct buffer_head *bh;
-                if (current->flags & PF_MEMALLOC)
+        struct timespec atime;
-                        return 0;
+        struct gfs2_dinode *di;
-                if (sync)
+        int ret = 0;
-                        gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+        /* Only write "normal" (GIF_USER) inodes, and never from PF_MEMALLOC context */
+        if (!test_bit(GIF_USER, &ip->i_flags) ||
+            (current->flags & PF_MEMALLOC))
+                return 0;
+        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (ret)
+                goto do_flush;
+        ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (ret)
+                goto do_unlock;
+        ret = gfs2_meta_inode_buffer(ip, &bh);
+        if (ret == 0) {
+                di = (struct gfs2_dinode *)bh->b_data;
+                atime.tv_sec = be64_to_cpu(di->di_atime);
+                atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+                if (timespec_compare(&inode->i_atime, &atime) > 0) {
+                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+                        gfs2_dinode_out(ip, bh->b_data);
+                }
+                brelse(bh);
        }
+        gfs2_trans_end(sdp);
+do_unlock:
+        gfs2_glock_dq_uninit(&gh);
+do_flush:
+        if (sync != 0)
+                gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+        return ret;
+}
-        return 0;
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+        struct gfs2_holder t_gh;
+        int error;
+        gfs2_quota_sync(sdp);
+        gfs2_statfs_sync(sdp);
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+                                   &t_gh);
+        if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return error;
+        gfs2_meta_syncfs(sdp);
+        gfs2_log_shutdown(sdp);
+        clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+        if (t_gh.gh_gl)
+                gfs2_glock_dq_uninit(&t_gh);
+        gfs2_quota_cleanup(sdp);
+        return error;
 }
 /**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
        struct gfs2_sbd *sdp = sb->s_fs_info;
        int error;
-        if (!sdp)
-                return;
-        if (!strncmp(sb->s_type->name, "gfs2meta", 8))
-                return; /* Nothing to do */
        /*  Unfreeze the filesystem, if we need to  */
        mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
        /*  Release stuff  */
-        iput(sdp->sd_master_dir);
        iput(sdp->sd_jindex);
        iput(sdp->sd_inum_inode);
        iput(sdp->sd_statfs_inode);
@@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb)
 *
 * Flushes the log to disk.
 */
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
        sb->s_dirt = 0;
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
                }
        }
-        if (*flags & (MS_NOATIME | MS_NODIRATIME))
-                set_bit(SDF_NOATIME, &sdp->sd_flags);
-        else
-                clear_bit(SDF_NOATIME, &sdp->sd_flags);
-        /* Don't let the VFS update atimes.  GFS2 handles this itself. */
-        *flags |= MS_NOATIME | MS_NODIRATIME;
        return error;
 }
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 * inode's blocks, or alternatively pass the baton on to another
 * node for later deallocation.
 */
 static void gfs2_drop_inode(struct inode *inode)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
        }
 }
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+        do {
+                if (d1 == d2)
+                        return 1;
+                d1 = d1->d_parent;
+        } while (!IS_ROOT(d1));
+        return 0;
+}
 /**
 * gfs2_show_options - Show mount options for /proc/mounts
 * @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
        struct gfs2_args *args = &sdp->sd_args;
+        if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+                seq_printf(s, ",meta");
        if (args->ar_lockproto[0])
                seq_printf(s, ",lockproto=%s", args->ar_lockproto);
        if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 * conversion on the iopen lock, but we can change that later. This
 * is safe, just less efficient.
 */
 static void gfs2_delete_inode(struct inode *inode)
 {
        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
        clear_inode(inode);
 }
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
 {
        struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca831991cbc2..c3ba3d9d0aac 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
 #include "trans.h"
 #include "util.h"
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-/**
- * gfs2_tune_init - Fill a gfs2_tune structure with default values
- * @gt: tune
- *
- */
-void gfs2_tune_init(struct gfs2_tune *gt)
-{
-        spin_lock_init(&gt->gt_spin);
-        gt->gt_demote_secs = 300;
-        gt->gt_incore_log_blocks = 1024;
-        gt->gt_log_flush_secs = 60;
-        gt->gt_recoverd_secs = 60;
-        gt->gt_logd_secs = 1;
-        gt->gt_quotad_secs = 5;
-        gt->gt_quota_simul_sync = 64;
-        gt->gt_quota_warn_period = 10;
-        gt->gt_quota_scale_num = 1;
-        gt->gt_quota_scale_den = 1;
-        gt->gt_quota_cache_secs = 300;
-        gt->gt_quota_quantum = 60;
-        gt->gt_atime_quantum = 3600;
-        gt->gt_new_files_jdata = 0;
-        gt->gt_max_readahead = 1 << 18;
-        gt->gt_stall_secs = 600;
-        gt->gt_complain_secs = 10;
-        gt->gt_statfs_quantum = 30;
-        gt->gt_statfs_slow = 0;
-}
-/**
- * gfs2_check_sb - Check superblock
- * @sdp: the filesystem
- * @sb: The superblock
- * @silent: Don't print a message if the check fails
- *
- * Checks the version code of the FS is one that we understand how to
- * read and that the sizes of the various on-disk structures have not
- * changed.
- */
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
-{
-        unsigned int x;
-        if (sb->sb_magic != GFS2_MAGIC ||
-            sb->sb_type != GFS2_METATYPE_SB) {
-                if (!silent)
-                        printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
-                return -EINVAL;
-        }
-        /*  If format numbers match exactly, we're done.  */
-        if (sb->sb_fs_format == GFS2_FORMAT_FS &&
-            sb->sb_multihost_format == GFS2_FORMAT_MULTI)
-                return 0;
-        if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-                for (x = 0; gfs2_old_fs_formats[x]; x++)
-                        if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-                                break;
-                if (!gfs2_old_fs_formats[x]) {
-                        printk(KERN_WARNING
-                               "GFS2: code version (%u, %u) is incompatible "
-                               "with ondisk format (%u, %u)\n",
-                               GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                               sb->sb_fs_format, sb->sb_multihost_format);
-                        printk(KERN_WARNING
-                               "GFS2: I don't know how to upgrade this FS\n");
-                        return -EINVAL;
-                }
-        }
-        if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-                for (x = 0; gfs2_old_multihost_formats[x]; x++)
-                        if (gfs2_old_multihost_formats[x] ==
-                            sb->sb_multihost_format)
-                                break;
-                if (!gfs2_old_multihost_formats[x]) {
-                        printk(KERN_WARNING
-                               "GFS2: code version (%u, %u) is incompatible "
-                               "with ondisk format (%u, %u)\n",
-                               GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                               sb->sb_fs_format, sb->sb_multihost_format);
-                        printk(KERN_WARNING
-                               "GFS2: I don't know how to upgrade this FS\n");
-                        return -EINVAL;
-                }
-        }
-        if (!sdp->sd_args.ar_upgrade) {
-                printk(KERN_WARNING
-                       "GFS2: code version (%u, %u) is incompatible "
-                       "with ondisk format (%u, %u)\n",
-                       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                       sb->sb_fs_format, sb->sb_multihost_format);
-                printk(KERN_INFO
-                       "GFS2: Use the \"upgrade\" mount option to upgrade "
-                       "the FS\n");
-                printk(KERN_INFO "GFS2: See the manual for more details\n");
-                return -EINVAL;
-        }
-        return 0;
-}
-static void end_bio_io_page(struct bio *bio, int error)
-{
-        struct page *page = bio->bi_private;
-        if (!error)
-                SetPageUptodate(page);
-        else
-                printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
-        unlock_page(page);
-}
-static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
-{
-        const struct gfs2_sb *str = buf;
-        sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
-        sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
-        sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
-        sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
-        sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
-        sb->sb_bsize = be32_to_cpu(str->sb_bsize);
-        sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
-        sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
-        sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
-        sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
-        sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
-        memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
-        memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
-}
-/**
- * gfs2_read_super - Read the gfs2 super block from disk
- * @sdp: The GFS2 super block
- * @sector: The location of the super block
- * @error: The error code to return
- *
- * This uses the bio functions to read the super block from disk
- * because we want to be 100% sure that we never read cached data.
- * A super block is read twice only during each GFS2 mount and is
- * never written to by the filesystem. The first time its read no
- * locks are held, and the only details which are looked at are those
- * relating to the locking protocol. Once locking is up and working,
- * the sb is read again under the lock to establish the location of
- * the master directory (contains pointers to journals etc) and the
- * root directory.
- *
- * Returns: 0 on success or error
- */
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
-{
-        struct super_block *sb = sdp->sd_vfs;
-        struct gfs2_sb *p;
-        struct page *page;
-        struct bio *bio;
-        page = alloc_page(GFP_NOFS);
-        if (unlikely(!page))
-                return -ENOBUFS;
-        ClearPageUptodate(page);
-        ClearPageDirty(page);
-        lock_page(page);
-        bio = bio_alloc(GFP_NOFS, 1);
-        if (unlikely(!bio)) {
-                __free_page(page);
-                return -ENOBUFS;
-        }
-        bio->bi_sector = sector * (sb->s_blocksize >> 9);
-        bio->bi_bdev = sb->s_bdev;
-        bio_add_page(bio, page, PAGE_SIZE, 0);
-        bio->bi_end_io = end_bio_io_page;
-        bio->bi_private = page;
-        submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
-        wait_on_page_locked(page);
-        bio_put(bio);
-        if (!PageUptodate(page)) {
-                __free_page(page);
-                return -EIO;
-        }
-        p = kmap(page);
-        gfs2_sb_in(&sdp->sd_sb, p);
-        kunmap(page);
-        __free_page(page);
-        return 0;
-}
-/**
- * gfs2_read_sb - Read super block
- * @sdp: The GFS2 superblock
- * @gl: the glock for the superblock (assumed to be held)
- * @silent: Don't print message if mount fails
- *
- */
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
-{
-        u32 hash_blocks, ind_blocks, leaf_blocks;
-        u32 tmp_blocks;
-        unsigned int x;
-        int error;
-        error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
-        if (error) {
-                if (!silent)
-                        fs_err(sdp, "can't read superblock\n");
-                return error;
-        }
-        error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
-        if (error)
-                return error;
-        sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
-                               GFS2_BASIC_BLOCK_SHIFT;
-        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
-        sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
-                          sizeof(struct gfs2_dinode)) / sizeof(u64);
-        sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
-                          sizeof(struct gfs2_meta_header)) / sizeof(u64);
-        sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
-        sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
-        sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
-        sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
-        sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
-                                sizeof(struct gfs2_meta_header)) /
-                                sizeof(struct gfs2_quota_change);
-        /* Compute maximum reservation required to add a entry to a directory */
-        hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
-                             sdp->sd_jbsize);
-        ind_blocks = 0;
-        for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
-                tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
-                ind_blocks += tmp_blocks;
-        }
-        leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
-        sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
-        sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
-                                sizeof(struct gfs2_dinode);
-        sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
-        for (x = 2;; x++) {
-                u64 space, d;
-                u32 m;
-                space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
-                d = space;
-                m = do_div(d, sdp->sd_inptrs);
-                if (d != sdp->sd_heightsize[x - 1] || m)
-                        break;
-                sdp->sd_heightsize[x] = space;
-        }
-        sdp->sd_max_height = x;
-        sdp->sd_heightsize[x] = ~0;
-        gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
-        sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
-                                 sizeof(struct gfs2_dinode);
-        sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
-        for (x = 2;; x++) {
-                u64 space, d;
-                u32 m;
-                space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
-                d = space;
-                m = do_div(d, sdp->sd_inptrs);
-                if (d != sdp->sd_jheightsize[x - 1] || m)
-                        break;
-                sdp->sd_jheightsize[x] = space;
-        }
-        sdp->sd_max_jheight = x;
-        sdp->sd_jheightsize[x] = ~0;
-        gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-        return 0;
-}
 /**
 * gfs2_jindex_hold - Grab a lock on the jindex
 * @sdp: The GFS2 superblock
@@ -581,39 +274,6 @@ fail:
        return error;
 }
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
-        struct gfs2_holder t_gh;
-        int error;
-        gfs2_quota_sync(sdp);
-        gfs2_statfs_sync(sdp);
-        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-                                   &t_gh);
-        if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                return error;
-        gfs2_meta_syncfs(sdp);
-        gfs2_log_shutdown(sdp);
-        clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-        if (t_gh.gh_gl)
-                gfs2_glock_dq_uninit(&t_gh);
-        gfs2_quota_cleanup(sdp);
-        return error;
-}
 static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
        const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f7..50a4c9b1215e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
 #include "incore.h"
-void gfs2_tune_init(struct gfs2_tune *gt);
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
 void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
                              struct gfs2_inode **ipp);
 int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3f..7e1879f1a02c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota,           "%u\n");
 ARGS_ATTR(suiddir,         "%d\n");
 ARGS_ATTR(data,            "%d\n");
-/* one oddball doesn't fit the macro mold */
-static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
-{
-        return snprintf(buf, PAGE_SIZE, "%d\n",
-                        !!test_bit(SDF_NOATIME, &sdp->sd_flags));
-}
-static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
 static struct attribute *args_attrs[] = {
        &args_attr_lockproto.attr,
        &args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
        &args_attr_quota.attr,
        &args_attr_suiddir.attr,
        &args_attr_data.attr,
-        &args_attr_noatime.attr,
        NULL,
 };
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
-TUNE_ATTR(atime_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_log_flush_secs.attr,
        &tune_attr_quota_warn_period.attr,
        &tune_attr_quota_quantum.attr,
-        &tune_attr_atime_quantum.attr,
        &tune_attr_max_readahead.attr,
        &tune_attr_complain_secs.attr,
        &tune_attr_statfs_slow.attr,
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 60249429a253..d85c7d931cdf 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
 }
 /*
- * remove_kevent - cleans up and ultimately frees the given kevent
+ * remove_kevent - cleans up the given kevent
 *
 * Caller must hold dev->ev_mutex.
 */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
        dev->event_count--;
        dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+}
+/*
+ * free_kevent - frees the given kevent.
+ */
+static void free_kevent(struct inotify_kernel_event *kevent)
+{
        kfree(kevent->name);
        kmem_cache_free(event_cachep, kevent);
 }
@@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
                struct inotify_kernel_event *kevent;
                kevent = inotify_dev_get_event(dev);
                remove_kevent(dev, kevent);
+                free_kevent(kevent);
        }
 }
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
        dev = file->private_data;
        while (1) {
-                int events;
                prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
                mutex_lock(&dev->ev_mutex);
-                events = !list_empty(&dev->events);
+                if (!list_empty(&dev->events)) {
-                mutex_unlock(&dev->ev_mutex);
-                if (events) {
                        ret = 0;
                        break;
                }
+                mutex_unlock(&dev->ev_mutex);
                if (file->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
        if (ret)
                return ret;
-        mutex_lock(&dev->ev_mutex);
        while (1) {
                struct inotify_kernel_event *kevent;
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
                        }
                        break;
                }
+                remove_kevent(dev, kevent);
+                /*
+                 * Must perform the copy_to_user outside the mutex in order
+                 * to avoid a lock order reversal with mmap_sem.
+                 */
+                mutex_unlock(&dev->ev_mutex);
                if (copy_to_user(buf, &kevent->event, event_size)) {
                        ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
                        count -= kevent->event.len;
                }
-                remove_kevent(dev, kevent);
+                free_kevent(kevent);
+                mutex_lock(&dev->ev_mutex);
        }
        mutex_unlock(&dev->ev_mutex);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9abcd2b329f7..e9b20173fef3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw,
                }
        }
+        if (errors > 0) {
+                dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
+                                errors, (errors == 1 ? "" : "s"));
+                if (!sloppy)
+                        return 0;
+        }
        return 1;
 out_nomem:
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index b6ed38380ab8..54b8b4140c8f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)
         * enough space for either:
         */
        alloc = sizeof(struct posix_ace_state_array)
-                + cnt*sizeof(struct posix_ace_state);
+                + cnt*sizeof(struct posix_user_ace_state);
        state->users = kzalloc(alloc, GFP_KERNEL);
        if (!state->users)
                return -ENOMEM;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2e51adac65de..e5b51ffafc6c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
        int             slack_bytes;
        __be32          status;
-        status = nfserr_resource;
-        cstate = cstate_alloc();
-        if (cstate == NULL)
-                goto out;
        resp->xbuf = &rqstp->rq_res;
        resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
        resp->tagp = resp->p;
@@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
        if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
                goto out;
+        status = nfserr_resource;
+        cstate = cstate_alloc();
+        if (cstate == NULL)
+                goto out;
        status = nfs_ok;
        while (!status && resp->opcnt < args->opcnt) {
                op = &args->ops[resp->opcnt++];
@@ -957,9 +957,9 @@ encode_op:
                nfsd4_increment_op_stats(op->opnum);
        }
+        cstate_free(cstate);
 out:
        nfsd4_release_compoundargs(args);
-        cstate_free(cstate);
        dprintk("nfsv4 compound returned %d\n", ntohl(status));
        return status;
 }
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e8..4087fbdac327 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
 * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
 * file since it was last opened.  I think the names speak for themselves but
 * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ * documentation: http://www.linux-ntfs.org/
 */
 enum {
        USN_REASON_DATA_OVERWRITE       = const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
 * Source info flags (32-bit).  Information about the source of the change(s)
 * to the file.  For detailed descriptions of what these mean, see the Linux
 * NTFS project NTFS documentation:
- *      http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ *      http://www.linux-ntfs.org/
 */
 enum {
        USN_SOURCE_DATA_MANAGEMENT        = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb5078..a53da1466277 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
                ocfs2_error(inode->i_sb,
                            "Inode %llu has a hole at block %llu\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7d6b34e201db..7408227c49c9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
 * a pointer to that same buffer (for convenience).
 */
-char *disk_name(struct gendisk *hd, int part, char *buf)
+char *disk_name(struct gendisk *hd, int partno, char *buf)
 {
-        if (!part)
+        if (!partno)
                snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
        else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
-                snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part);
+                snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
        else
-                snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part);
+                snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
        return buf;
 }
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-        int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
+        return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
-        return disk_name(bdev->bd_disk, part, buf);
 }
 EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        if (isdigit(state->name[strlen(state->name)-1]))
                sprintf(state->name, "p");
-        state->limit = hd->minors;
+        state->limit = disk_max_parts(hd);
        i = res = err = 0;
        while (!res && check_part[i]) {
                memset(&state->parts, 0, sizeof(state->parts));
@@ -204,21 +203,22 @@ static ssize_t part_start_show(struct device *dev,
        return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
 }
-static ssize_t part_size_show(struct device *dev,
+ssize_t part_size_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+                       struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
        return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
-static ssize_t part_stat_show(struct device *dev,
+ssize_t part_stat_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+                       struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
+        int cpu;
-        preempt_disable();
+        cpu = part_stat_lock();
-        part_round_stats(p);
+        part_round_stats(cpu, p);
-        preempt_enable();
+        part_stat_unlock();
        return sprintf(buf,
                "%8lu %8lu %8llu %8u "
                "%8lu %8lu %8llu %8u "
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
 }
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-static ssize_t part_fail_show(struct device *dev,
+ssize_t part_fail_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+                       struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
        return sprintf(buf, "%d\n", p->make_it_fail);
 }
-static ssize_t part_fail_store(struct device *dev,
+ssize_t part_fail_store(struct device *dev,
-                               struct device_attribute *attr,
+                        struct device_attribute *attr,
-                               const char *buf, size_t count)
+                        const char *buf, size_t count)
 {
        struct hd_struct *p = dev_to_part(dev);
        int i;
@@ -300,40 +300,34 @@ struct device_type part_type = {
        .release        = part_release,
 };
-static inline void partition_sysfs_add_subdir(struct hd_struct *p)
+static void delete_partition_rcu_cb(struct rcu_head *head)
-{
-        struct kobject *k;
-        k = kobject_get(&p->dev.kobj);
-        p->holder_dir = kobject_create_and_add("holders", k);
-        kobject_put(k);
-}
-static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
-        struct kobject *k;
+        struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
-        k = kobject_get(&disk->dev.kobj);
+        part->start_sect = 0;
-        disk->holder_dir = kobject_create_and_add("holders", k);
+        part->nr_sects = 0;
-        disk->slave_dir = kobject_create_and_add("slaves", k);
+        part_stat_set_all(part, 0);
-        kobject_put(k);
+        put_device(part_to_dev(part));
 }
-void delete_partition(struct gendisk *disk, int part)
+void delete_partition(struct gendisk *disk, int partno)
 {
-        struct hd_struct *p = disk->part[part-1];
+        struct disk_part_tbl *ptbl = disk->part_tbl;
+        struct hd_struct *part;
-        if (!p)
+        if (partno >= ptbl->len)
                return;
-        if (!p->nr_sects)
+        part = ptbl->part[partno];
+        if (!part)
                return;
-        disk->part[part-1] = NULL;
-        p->start_sect = 0;
+        blk_free_devt(part_devt(part));
-        p->nr_sects = 0;
+        rcu_assign_pointer(ptbl->part[partno], NULL);
-        part_stat_set_all(p, 0);
+        kobject_put(part->holder_dir);
-        kobject_put(p->holder_dir);
+        device_del(part_to_dev(part));
-        device_del(&p->dev);
-        put_device(&p->dev);
+        call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 static ssize_t whole_disk_show(struct device *dev,
@@ -344,102 +338,132 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
                   whole_disk_show, NULL);
-int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
+int add_partition(struct gendisk *disk, int partno,
+                  sector_t start, sector_t len, int flags)
 {
        struct hd_struct *p;
+        dev_t devt = MKDEV(0, 0);
+        struct device *ddev = disk_to_dev(disk);
+        struct device *pdev;
+        struct disk_part_tbl *ptbl;
+        const char *dname;
        int err;
+        err = disk_expand_part_tbl(disk, partno);
+        if (err)
+                return err;
+        ptbl = disk->part_tbl;
+        if (ptbl->part[partno])
+                return -EBUSY;
        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
                return -ENOMEM;
        if (!init_part_stats(p)) {
                err = -ENOMEM;
-                goto out0;
+                goto out_free;
        }
+        pdev = part_to_dev(p);
        p->start_sect = start;
        p->nr_sects = len;
-        p->partno = part;
+        p->partno = partno;
-        p->policy = disk->policy;
+        p->policy = get_disk_ro(disk);
-        if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
+        dname = dev_name(ddev);
-                snprintf(p->dev.bus_id, BUS_ID_SIZE,
+        if (isdigit(dname[strlen(dname) - 1]))
-                "%sp%d", disk->dev.bus_id, part);
+                snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
        else
-                snprintf(p->dev.bus_id, BUS_ID_SIZE,
+                snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
-                         "%s%d", disk->dev.bus_id, part);
+        device_initialize(pdev);
+        pdev->class = &block_class;
+        pdev->type = &part_type;
+        pdev->parent = ddev;
-        device_initialize(&p->dev);
+        err = blk_alloc_devt(p, &devt);
-        p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
+        if (err)
-        p->dev.class = &block_class;
+                goto out_free;
-        p->dev.type = &part_type;
+        pdev->devt = devt;
-        p->dev.parent = &disk->dev;
-        disk->part[part-1] = p;
        /* delay uevent until 'holders' subdir is created */
-        p->dev.uevent_suppress = 1;
+        pdev->uevent_suppress = 1;
-        err = device_add(&p->dev);
+        err = device_add(pdev);
        if (err)
-                goto out1;
+                goto out_put;
-        partition_sysfs_add_subdir(p);
-        p->dev.uevent_suppress = 0;
+        err = -ENOMEM;
+        p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
+        if (!p->holder_dir)
+                goto out_del;
+        pdev->uevent_suppress = 0;
        if (flags & ADDPART_FLAG_WHOLEDISK) {
-                err = device_create_file(&p->dev, &dev_attr_whole_disk);
+                err = device_create_file(pdev, &dev_attr_whole_disk);
                if (err)
-                        goto out2;
+                        goto out_del;
        }
+        /* everything is up and running, commence */
+        INIT_RCU_HEAD(&p->rcu_head);
+        rcu_assign_pointer(ptbl->part[partno], p);
        /* suppress uevent if the disk supresses it */
-        if (!disk->dev.uevent_suppress)
+        if (!ddev->uevent_suppress)
-                kobject_uevent(&p->dev.kobj, KOBJ_ADD);
+                kobject_uevent(&pdev->kobj, KOBJ_ADD);
        return 0;
-out2:
+out_free:
-        device_del(&p->dev);
-out1:
-        put_device(&p->dev);
-        free_part_stats(p);
-out0:
        kfree(p);
        return err;
+out_del:
+        kobject_put(p->holder_dir);
+        device_del(pdev);
+out_put:
+        put_device(pdev);
+        blk_free_devt(devt);
+        return err;
 }
 /* Not exported, helper to add_disk(). */
 void register_disk(struct gendisk *disk)
 {
+        struct device *ddev = disk_to_dev(disk);
        struct block_device *bdev;
+        struct disk_part_iter piter;
+        struct hd_struct *part;
        char *s;
-        int i;
-        struct hd_struct *p;
        int err;
-        disk->dev.parent = disk->driverfs_dev;
+        ddev->parent = disk->driverfs_dev;
-        disk->dev.devt = MKDEV(disk->major, disk->first_minor);
-        strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE);
+        strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
        /* ewww... some of these buggers have / in the name... */
-        s = strchr(disk->dev.bus_id, '/');
+        s = strchr(ddev->bus_id, '/');
        if (s)
                *s = '!';
        /* delay uevents, until we scanned partition table */
-        disk->dev.uevent_suppress = 1;
+        ddev->uevent_suppress = 1;
-        if (device_add(&disk->dev))
+        if (device_add(ddev))
                return;
 #ifndef CONFIG_SYSFS_DEPRECATED
-        err = sysfs_create_link(block_depr, &disk->dev.kobj,
+        err = sysfs_create_link(block_depr, &ddev->kobj,
-                                kobject_name(&disk->dev.kobj));
+                                kobject_name(&ddev->kobj));
        if (err) {
-                device_del(&disk->dev);
+                device_del(ddev);
                return;
        }
 #endif
-        disk_sysfs_add_subdirs(disk);
+        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
        /* No minors to use for partitions */
-        if (disk->minors == 1)
+        if (!disk_partitionable(disk))
                goto exit;
        /* No such device (e.g., media were just removed) */
@@ -458,50 +482,66 @@ void register_disk(struct gendisk *disk)
 exit:
        /* announce disk after possible partitions are created */
-        disk->dev.uevent_suppress = 0;
+        ddev->uevent_suppress = 0;
-        kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
+        kobject_uevent(&ddev->kobj, KOBJ_ADD);
        /* announce possible partitions */
-        for (i = 1; i < disk->minors; i++) {
+        disk_part_iter_init(&piter, disk, 0);
-                p = disk->part[i-1];
+        while ((part = disk_part_iter_next(&piter)))
-                if (!p || !p->nr_sects)
+                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
-                        continue;
+        disk_part_iter_exit(&piter);
-                kobject_uevent(&p->dev.kobj, KOBJ_ADD);
-        }
 }
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+        struct disk_part_iter piter;
+        struct hd_struct *part;
        struct parsed_partitions *state;
-        int p, res;
+        int p, highest, res;
        if (bdev->bd_part_count)
                return -EBUSY;
        res = invalidate_partition(disk, 0);
        if (res)
                return res;
-        bdev->bd_invalidated = 0;
-        for (p = 1; p < disk->minors; p++)
+        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
-                delete_partition(disk, p);
+        while ((part = disk_part_iter_next(&piter)))
+                delete_partition(disk, part->partno);
+        disk_part_iter_exit(&piter);
        if (disk->fops->revalidate_disk)
                disk->fops->revalidate_disk(disk);
+        check_disk_size_change(disk, bdev);
+        bdev->bd_invalidated = 0;
        if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
                return 0;
        if (IS_ERR(state))      /* I/O error reading the partition table */
                return -EIO;
        /* tell userspace that the media / partition table may have changed */
-        kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+        kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+        /* Detect the highest partition number and preallocate
+         * disk->part_tbl.  This is an optimization and not strictly
+         * necessary.
+         */
+        for (p = 1, highest = 0; p < state->limit; p++)
+                if (state->parts[p].size)
+                        highest = p;
+        disk_expand_part_tbl(disk, highest);
+        /* add partitions */
        for (p = 1; p < state->limit; p++) {
                sector_t size = state->parts[p].size;
                sector_t from = state->parts[p].from;
                if (!size)
                        continue;
                if (from + size > get_capacity(disk)) {
-                        printk(KERN_ERR " %s: p%d exceeds device capacity\n",
+                        printk(KERN_WARNING
+                                "%s: p%d exceeds device capacity\n",
                                disk->disk_name, p);
-                        continue;
                }
                res = add_partition(disk, p, from, size, state->parts[p].flags);
                if (res) {
@@ -541,25 +581,31 @@ EXPORT_SYMBOL(read_dev_sector);
 void del_gendisk(struct gendisk *disk)
 {
-        int p;
+        struct disk_part_iter piter;
+        struct hd_struct *part;
        /* invalidate stuff */
-        for (p = disk->minors - 1; p > 0; p--) {
+        disk_part_iter_init(&piter, disk,
-                invalidate_partition(disk, p);
+                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
-                delete_partition(disk, p);
+        while ((part = disk_part_iter_next(&piter))) {
+                invalidate_partition(disk, part->partno);
+                delete_partition(disk, part->partno);
        }
+        disk_part_iter_exit(&piter);
        invalidate_partition(disk, 0);
-        disk->capacity = 0;
+        blk_free_devt(disk_to_dev(disk)->devt);
+        set_capacity(disk, 0);
        disk->flags &= ~GENHD_FL_UP;
        unlink_gendisk(disk);
-        disk_stat_set_all(disk, 0);
+        part_stat_set_all(&disk->part0, 0);
-        disk->stamp = 0;
+        disk->part0.stamp = 0;
-        kobject_put(disk->holder_dir);
+        kobject_put(disk->part0.holder_dir);
        kobject_put(disk->slave_dir);
        disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
-        sysfs_remove_link(block_depr, disk->dev.bus_id);
+        sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 #endif
-        device_del(&disk->dev);
+        device_del(disk_to_dev(disk));
 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
 * add_gd_partition adds a partitions details to the devices partition
 * description.
 */
-enum { MAX_PART = 256 };
 struct parsed_partitions {
        char name[BDEVNAME_SIZE];
        struct {
                sector_t from;
                sector_t size;
                int flags;
-        } parts[MAX_PART];
+        } parts[DISK_MAX_PARTS];
        int next;
        int limit;
 };
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c6..71c9be59c9c2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -337,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
        return 0;
 }
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-static cputime_t task_utime(struct task_struct *p)
-{
-        return p->utime;
-}
-static cputime_t task_stime(struct task_struct *p)
-{
-        return p->stime;
-}
-#else
-static cputime_t task_utime(struct task_struct *p)
-{
-        clock_t utime = cputime_to_clock_t(p->utime),
-                total = utime + cputime_to_clock_t(p->stime);
-        u64 temp;
-        /*
-         * Use CFS's precise accounting:
-         */
-        temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
-        if (total) {
-                temp *= utime;
-                do_div(temp, total);
-        }
-        utime = (clock_t)temp;
-        p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
-        return p->prev_utime;
-}
-static cputime_t task_stime(struct task_struct *p)
-{
-        clock_t stime;
-        /*
-         * Use CFS's precise accounting. (we subtract utime from
-         * the total, to make sure the total observed by userspace
-         * grows monotonically - apps rely on that):
-         */
-        stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-                        cputime_to_clock_t(task_utime(p));
-        if (stime >= 0)
-                p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
-        return p->prev_stime;
-}
-#endif
-static cputime_t task_gtime(struct task_struct *p)
-{
-        return p->gtime;
-}
 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task, int whole)
 {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index bca0f81eb687..7821589a17d5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -547,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
        for (tmp = dir->subdir; tmp; tmp = tmp->next)
                if (strcmp(tmp->name, dp->name) == 0) {
-                        printk(KERN_WARNING "proc_dir_entry '%s' already "
+                        printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
-                                        "registered\n", dp->name);
+                                dir->name, dp->name);
                        dump_stack();
                        break;
                }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ded969862960..29e20c6b1f7f 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
 #include <linux/tty.h>
 #include <linux/string.h>
 #include <linux/mman.h>
+#include <linux/quicklist.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
@@ -182,6 +183,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                "SReclaimable: %8lu kB\n"
                "SUnreclaim:   %8lu kB\n"
                "PageTables:   %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+                "Quicklists:   %8lu kB\n"
+#endif
                "NFS_Unstable: %8lu kB\n"
                "Bounce:       %8lu kB\n"
                "WritebackTmp: %8lu kB\n"
@@ -214,6 +218,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                K(global_page_state(NR_SLAB_RECLAIMABLE)),
                K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
                K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+                K(quicklist_total_size()),
+#endif
                K(global_page_state(NR_UNSTABLE_NFS)),
                K(global_page_state(NR_BOUNCE)),
                K(global_page_state(NR_WRITEBACK_TEMP)),
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff4..5145cb9125af 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
 * size 0 on the assumption that it's going to be used for an mmap of shared
 * memory
 */
-static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
        struct pagevec lru_pvec;
        unsigned long npages, xpages, loop, limit;
diff --git a/fs/splice.c b/fs/splice.c
index 1bbc6f4bb09c..a1e701c27156 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
        if (unlikely(!(out->f_mode & FMODE_WRITE)))
                return -EBADF;
+        if (unlikely(out->f_flags & O_APPEND))
+                return -EINVAL;
        ret = rw_verify_area(WRITE, out, ppos, len);
        if (unlikely(ret < 0))
                return ret;
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 154098157473..73db464cd08b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
        int subtract_lebs;
        long long available;
-        /*
-         * Force the amount available to the total size reported if the used
-         * space is zero.
-         */
-        if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
-            c->budg_data_growth + c->budg_dd_growth == 0) {
-                /* Do the same calculation as for c->block_cnt */
-                available = c->main_lebs - 2;
-                available *= c->leb_size - c->dark_wm;
-                return available;
-        }
        available = c->main_bytes - c->lst.total_used;
        /*
@@ -714,34 +702,106 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 }
 /**
- * ubifs_budg_get_free_space - return amount of free space.
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space applications tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexing nodes as well. This introduces additional
+ * overhead, and UBIFS has to report slightly less free space to meet the
+ * above expectation.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, tripled because we always allow enough
+ * space to write the index thrice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ */
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+{
+        int divisor, factor, f;
+        /*
+         * Reported space size is @free * X, where X is UBIFS block size
+         * divided by UBIFS block size + all overhead one data block
+         * introduces. The overhead is the node header + indexing overhead.
+         *
+         * Indexing overhead calculations are based on the following formula:
+         * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
+         * of data nodes, f - fanout. Because the effective UBIFS fanout is
+         * half the maximum fanout, we assume that each data node
+         * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
+         * Note, the multiplier 3 is because UBIFS reserves three times as much
+         * space for the index.
+         */
+        f = c->fanout > 3 ? c->fanout >> 1 : 2;
+        factor = UBIFS_BLOCK_SIZE;
+        divisor = UBIFS_MAX_DATA_NODE_SZ;
+        divisor += (c->max_idx_node_sz * 3) / (f - 1);
+        free *= factor;
+        do_div(free, divisor);
+        return free;
+}
+/**
+ * ubifs_get_free_space - return amount of free space.
 * @c: UBIFS file-system description object
 *
- * This function returns amount of free space on the file-system.
+ * This function calculates amount of free space to report to user-space.
+ *
+ * Because UBIFS may introduce substantial overhead (the index, node headers,
+ * alignment, wastage at the end of eraseblocks, etc), it cannot report real
+ * amount of free flash space it has (well, because not all dirty space is
+ * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would break user expectations about what free space is. Users seem to be
+ * accustomed to assuming that if the file-system reports N bytes of free space,
+ * they would be able to fit a file of N bytes to the FS. This almost works for
+ * traditional file-systems, because they have way less overhead than UBIFS.
+ * So, to keep users happy, UBIFS tries to take the overhead into account.
 */
-long long ubifs_budg_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space(struct ubifs_info *c)
 {
-        int min_idx_lebs, rsvd_idx_lebs;
+        int min_idx_lebs, rsvd_idx_lebs, lebs;
        long long available, outstanding, free;
-        /* Do exactly the same calculations as in 'do_budget_space()' */
        spin_lock(&c->space_lock);
        min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+        outstanding = c->budg_data_growth + c->budg_dd_growth;
-        if (min_idx_lebs > c->lst.idx_lebs)
+        /*
-                rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+         * Force the amount available to the total size reported if the used
-        else
+         * space is zero.
-                rsvd_idx_lebs = 0;
+         */
+        if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
-        if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
-                                - c->lst.taken_empty_lebs) {
                spin_unlock(&c->space_lock);
-                return 0;
+                return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
        }
        available = ubifs_calc_available(c, min_idx_lebs);
-        outstanding = c->budg_data_growth + c->budg_dd_growth;
-        c->min_idx_lebs = min_idx_lebs;
+        /*
+         * When reporting free space to user-space, UBIFS guarantees that it is
+         * possible to write a file of free space size. This means that for
+         * empty LEBs we may use more precise calculations than
+         * 'ubifs_calc_available()' is using. Namely, we know that in empty
+         * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
+         * Thus, amend the available space.
+         *
+         * Note, the calculations below are similar to what we have in
+         * 'do_budget_space()', so refer there for comments.
+         */
+        if (min_idx_lebs > c->lst.idx_lebs)
+                rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+        else
+                rsvd_idx_lebs = 0;
+        lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+               c->lst.taken_empty_lebs;
+        lebs -= rsvd_idx_lebs;
+        available += lebs * (c->dark_wm - c->leb_overhead);
        spin_unlock(&c->space_lock);
        if (available > outstanding)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b9cb77473758..d7f7645779f2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
                printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
                for (i = 0; i < n; i++)
                        printk(KERN_DEBUG "\t  ino %llu\n",
-                               le64_to_cpu(orph->inos[i]));
+                               (unsigned long long)le64_to_cpu(orph->inos[i]));
                break;
        }
        default:
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c96f1fb7016..526c01ec8003 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -426,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
        while (1) {
                dbg_gen("feed '%s', ino %llu, new f_pos %#x",
-                        dent->name, le64_to_cpu(dent->inum),
+                        dent->name, (unsigned long long)le64_to_cpu(dent->inum),
                        key_hash_flash(c, &dent->key));
                ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
@@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
        if (err) {
                if (err != -ENOSPC)
                        return err;
-                err = 0;
                budgeted = 0;
        }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 4071d1cae29f..3d698e2022b1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
        int err;
        struct ubifs_budget_req req;
        loff_t old_size = inode->i_size, new_size = attr->ia_size;
-        int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
+        int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
        struct ubifs_inode *ui = ubifs_inode(inode);
        dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
        /* A funny way to budget for truncation node */
        req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
        err = ubifs_budget_space(c, &req);
-        if (err)
+        if (err) {
-                return err;
+                /*
+                 * Treat truncations to zero as deletion and always allow them,
+                 * just like we do for '->unlink()'.
+                 */
+                if (new_size || err != -ENOSPC)
+                        return err;
+                budgeted = 0;
+        }
        err = vmtruncate(inode, new_size);
        if (err)
@@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
        err = ubifs_jnl_truncate(c, inode, old_size, new_size);
        mutex_unlock(&ui->ui_mutex);
 out_budg:
-        ubifs_release_budget(c, &req);
+        if (budgeted)
+                ubifs_release_budget(c, &req);
+        else {
+                c->nospace = c->nospace_rp = 0;
+                smp_wmb();
+        }
        return err;
 }
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index adee7b5ddeab..47814cde2407 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
 * or do not have an LEB which satisfies the @min_space criteria.
 *
- * Note:
+ * Note, LEBs which have less than dead watermark of free + dirty space are
- *   o LEBs which have less than dead watermark of dirty space are never picked
+ * never picked by this function.
- *   by this function;
- *
- * Returns zero and the LEB properties of
- * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
- * negative error code in case of other failures. The returned LEB is marked as
- * "taken".
 *
 * The additional @pick_free argument controls if this function has to return a
 * free or freeable LEB if one is present. For example, GC must to set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
 *
 * In addition @pick_free is set to %2 by the recovery process in order to
 * recover gc_lnum in which case an index LEB must not be returned.
+ *
+ * This function returns zero and the LEB properties of found dirty LEB in case
+ * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
+ * case of other failures. The returned LEB is marked as "taken".
 */
 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
                         int min_space, int pick_free)
@@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
                int lebs, rsvd_idx_lebs = 0;
                spin_lock(&c->space_lock);
-                lebs = c->lst.empty_lebs;
+                lebs = c->lst.empty_lebs + c->idx_gc_cnt;
                lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
                /*
@@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
                lp = idx_lp;
        if (lp) {
-                ubifs_assert(lp->dirty >= c->dead_wm);
+                ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
                goto found;
        }
@@ -509,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
                rsvd_idx_lebs = 0;
        lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
               c->lst.taken_empty_lebs;
-        ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
        if (rsvd_idx_lebs < lebs)
                /*
                 * OK to allocate an empty LEB, but we still don't want to go
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac29081..02aba36fe3d4 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
                err = move_nodes(c, sleb);
                if (err)
-                        goto out;
+                        goto out_inc_seq;
                err = gc_sync_wbufs(c);
                if (err)
-                        goto out;
+                        goto out_inc_seq;
                err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
                if (err)
-                        goto out;
+                        goto out_inc_seq;
+                /* Allow for races with TNC */
+                c->gced_lnum = lnum;
+                smp_wmb();
+                c->gc_seq += 1;
+                smp_wmb();
                if (c->gc_lnum == -1) {
                        c->gc_lnum = lnum;
@@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 out:
        ubifs_scan_destroy(sleb);
        return err;
+out_inc_seq:
+        /* We may have moved at least some nodes so allow for races with TNC */
+        c->gced_lnum = lnum;
+        smp_wmb();
+        c->gc_seq += 1;
+        smp_wmb();
+        goto out;
 }
 /**
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 87dabf9fe742..4c12a9215d7f 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -284,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
 }
 /**
- * ubifs_reported_space - calculate reported free space.
- * @c: the UBIFS file-system description object
- * @free: amount of free space
- *
- * This function calculates amount of free space which will be reported to
- * user-space. User-space application tend to expect that if the file-system
- * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
- * are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
- * above expectetion.
- *
- * This function assumes free space is made up of uncompressed data nodes and
- * full index nodes (one per data node, doubled because we always allow enough
- * space to write the index twice).
- *
- * Note, the calculation is pessimistic, which means that most of the time
- * UBIFS reports less space than it actually has.
- */
-static inline long long ubifs_reported_space(const struct ubifs_info *c,
-                                             uint64_t free)
-{
-        int divisor, factor;
-        divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3);
-        factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
-        do_div(free, divisor);
-        return free * factor;
-}
-/**
 * ubifs_current_time - round current time to time granularity.
 * @inode: inode
 */
@@ -325,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)
                current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
 }
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function looks up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure.
+ */
+static inline int ubifs_tnc_lookup(struct ubifs_info *c,
+                                   const union ubifs_key *key, void *node)
+{
+        return ubifs_tnc_locate(c, key, node, NULL, NULL);
+}
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f71e6b8822c4..3f4902060c7a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -370,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct ubifs_info *c = dentry->d_sb->s_fs_info;
        unsigned long long free;
+        __le32 *uuid = (__le32 *)c->uuid;
-        free = ubifs_budg_get_free_space(c);
+        free = ubifs_get_free_space(c);
        dbg_gen("free space %lld bytes (%lld blocks)",
                free, free >> UBIFS_BLOCK_SHIFT);
@@ -386,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_files = 0;
        buf->f_ffree = 0;
        buf->f_namelen = UBIFS_MAX_NLEN;
+        buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
+        buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
        return 0;
 }
@@ -530,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c)
        c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
        c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
+        /*
+         * Calculate how many bytes would be wasted at the end of LEB if it was
+         * fully filled with data nodes of maximum size. This is used in
+         * calculations when reporting free space.
+         */
+        c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
        return 0;
 }
@@ -647,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c)
         * internally because it does not make much sense for UBIFS, but it is
         * necessary to report something for the 'statfs()' call.
         *
-         * Subtract the LEB reserved for GC and the LEB which is reserved for
+         * Subtract the LEB reserved for GC, the LEB which is reserved for
-         * deletions.
+         * deletions, and assume only one journal head is available.
-         *
-         * Review 'ubifs_calc_available()' if changing this calculation.
         */
-        tmp64 = c->main_lebs - 2;
+        tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
-        tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
+        tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
        tmp64 = ubifs_reported_space(c, tmp64);
        c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
@@ -1018,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)
                goto out_dereg;
        }
+        sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
        if (!mounted_read_only) {
                err = alloc_wbufs(c);
                if (err)
                        goto out_cbuf;
                /* Create background thread */
-                sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
-                        c->vi.vol_id);
                c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
                if (!c->bgt)
                        c->bgt = ERR_PTR(-EINVAL);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a96443..7634c5970887 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
                if (keys_cmp(c, key, &node_key) != 0)
                        ret = 0;
        }
-        if (ret == 0)
+        if (ret == 0 && c->replaying)
                dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
                        zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
        return ret;
@@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
 }
 /**
- * ubifs_tnc_lookup - look up a file-system node.
+ * maybe_leb_gced - determine if a LEB may have been garbage collected.
 * @c: UBIFS file-system description object
- * @key: node key to lookup
+ * @lnum: LEB number
- * @node: the node is returned here
+ * @gc_seq1: garbage collection sequence number
 *
- * This function look up and reads node with key @key. The caller has to make
+ * This function determines if @lnum may have been garbage collected since
- * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
- * of success, %-ENOENT if the node was not found, and a negative error code in
+ * %0 is returned.
- * case of failure.
 */
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
+static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
-                     void *node)
 {
-        int found, n, err;
+        int gc_seq2, gced_lnum;
-        struct ubifs_znode *znode;
-        struct ubifs_zbranch zbr, *zt;
-        mutex_lock(&c->tnc_mutex);
+        gced_lnum = c->gced_lnum;
-        found = ubifs_lookup_level0(c, key, &znode, &n);
+        smp_rmb();
-        if (!found) {
+        gc_seq2 = c->gc_seq;
-                err = -ENOENT;
+        /* Same seq means no GC */
-                goto out;
+        if (gc_seq1 == gc_seq2)
-        } else if (found < 0) {
+                return 0;
-                err = found;
+        /* Different by more than 1 means we don't know */
-                goto out;
+        if (gc_seq1 + 1 != gc_seq2)
-        }
+                return 1;
-        zt = &znode->zbranch[n];
+        /*
-        if (is_hash_key(c, key)) {
+         * We have seen the sequence number has increased by 1. Now we need to
-                /*
+         * be sure we read the right LEB number, so read it again.
-                 * In this case the leaf node cache gets used, so we pass the
+         */
-                 * address of the zbranch and keep the mutex locked
+        smp_rmb();
-                 */
+        if (gced_lnum != c->gced_lnum)
-                err = tnc_read_node_nm(c, zt, node);
+                return 1;
-                goto out;
+        /* Finally we can check lnum */
-        }
+        if (gced_lnum == lnum)
-        zbr = znode->zbranch[n];
+                return 1;
-        mutex_unlock(&c->tnc_mutex);
+        return 0;
-        err = ubifs_tnc_read_node(c, &zbr, node);
-        return err;
-out:
-        mutex_unlock(&c->tnc_mutex);
-        return err;
 }
 /**
@@ -1436,16 +1425,19 @@ out:
 * @lnum: LEB number is returned here
 * @offs: offset is returned here
 *
- * This function is the same as 'ubifs_tnc_lookup()' but it returns the node
+ * This function looks up and reads node with key @key. The caller has to make
- * location also. See 'ubifs_tnc_lookup()'.
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure. The node location can be returned in @lnum and @offs.
 */
 int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
                     void *node, int *lnum, int *offs)
 {
-        int found, n, err;
+        int found, n, err, safely = 0, gc_seq1;
        struct ubifs_znode *znode;
        struct ubifs_zbranch zbr, *zt;
+again:
        mutex_lock(&c->tnc_mutex);
        found = ubifs_lookup_level0(c, key, &znode, &n);
        if (!found) {
@@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
                goto out;
        }
        zt = &znode->zbranch[n];
+        if (lnum) {
+                *lnum = zt->lnum;
+                *offs = zt->offs;
+        }
        if (is_hash_key(c, key)) {
                /*
                 * In this case the leaf node cache gets used, so we pass the
                 * address of the zbranch and keep the mutex locked
                 */
-                *lnum = zt->lnum;
-                *offs = zt->offs;
                err = tnc_read_node_nm(c, zt, node);
                goto out;
        }
+        if (safely) {
+                err = ubifs_tnc_read_node(c, zt, node);
+                goto out;
+        }
+        /* Drop the TNC mutex prematurely and race with garbage collection */
        zbr = znode->zbranch[n];
+        gc_seq1 = c->gc_seq;
        mutex_unlock(&c->tnc_mutex);
-        *lnum = zbr.lnum;
+        if (ubifs_get_wbuf(c, zbr.lnum)) {
-        *offs = zbr.offs;
+                /* We do not GC journal heads */
+                err = ubifs_tnc_read_node(c, &zbr, node);
+                return err;
+        }
-        err = ubifs_tnc_read_node(c, &zbr, node);
+        err = fallible_read_node(c, key, &zbr, node);
-        return err;
+        if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+                /*
+                 * The node may have been GC'ed out from under us so try again
+                 * while keeping the TNC mutex locked.
+                 */
+                safely = 1;
+                goto again;
+        }
+        return 0;
 out:
        mutex_unlock(&c->tnc_mutex);
@@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 {
        int found, n, err;
        struct ubifs_znode *znode;
-        struct ubifs_zbranch zbr;
        dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
        mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
                goto out_unlock;
        }
-        zbr = znode->zbranch[n];
+        err = tnc_read_node_nm(c, &znode->zbranch[n], node);
-        mutex_unlock(&c->tnc_mutex);
-        err = tnc_read_node_nm(c, &zbr, node);
-        return err;
 out_unlock:
        mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index bd2121f3426e..a9ecbd9af20d 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
 #define UBIFS_SK_LEN 8
 /* Minimum index tree fanout */
-#define UBIFS_MIN_FANOUT 2
+#define UBIFS_MIN_FANOUT 3
 /* Maximum number of levels in UBIFS indexing B-tree */
 #define UBIFS_MAX_LEVELS 512
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d7f706f7a302..17c620b93eec 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -995,6 +995,9 @@ struct ubifs_mount_opts {
 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
 * @max_inode_sz: maximum possible inode size in bytes
 * @max_znode_sz: size of znode in bytes
+ *
+ * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
+ *                data nodes of maximum size - used in free space reporting
 * @dead_wm: LEB dead space watermark
 * @dark_wm: LEB dark space watermark
 * @block_cnt: count of 4KiB blocks on the FS
@@ -1028,6 +1031,8 @@ struct ubifs_mount_opts {
 * @sbuf: a buffer of LEB size used by GC and replay for scanning
 * @idx_gc: list of index LEBs that have been garbage collected
 * @idx_gc_cnt: number of elements on the idx_gc list
+ * @gc_seq: incremented for every non-index LEB garbage collected
+ * @gced_lnum: last non-index LEB that was garbage collected
 *
 * @infos_list: links all 'ubifs_info' objects
 * @umount_mutex: serializes shrinker and un-mount
@@ -1224,6 +1229,8 @@ struct ubifs_info {
        int max_idx_node_sz;
        long long max_inode_sz;
        int max_znode_sz;
+        int leb_overhead;
        int dead_wm;
        int dark_wm;
        int block_cnt;
@@ -1257,6 +1264,8 @@ struct ubifs_info {
        void *sbuf;
        struct list_head idx_gc;
        int idx_gc_cnt;
+        volatile int gc_seq;
+        volatile int gced_lnum;
        struct list_head infos_list;
        struct mutex umount_mutex;
@@ -1434,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
                                struct ubifs_budget_req *req);
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
                         struct ubifs_budget_req *req);
-long long ubifs_budg_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 /* find.c */
@@ -1451,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
 /* tnc.c */
 int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
                        struct ubifs_znode **zn, int *n);
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
-                     void *node);
 int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
                        void *node, const struct qstr *nm);
 int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d9..eb91f3b70320 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
        .release                = udf_release_file,
        .fsync                  = udf_fsync_file,
        .splice_read            = generic_file_splice_read,
+        .llseek                 = generic_file_llseek,
 };
 const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3d..a4f2b3ce45b0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
        *err = -ENOSPC;
        iinfo = UDF_I(inode);
-        iinfo->i_unique = 0;
+        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
-        iinfo->i_lenExtents = 0;
+                iinfo->i_efe = 1;
-        iinfo->i_next_alloc_block = 0;
+                if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
-        iinfo->i_next_alloc_goal = 0;
+                        sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
-        iinfo->i_strat4096 = 0;
+                iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+                                            sizeof(struct extendedFileEntry),
+                                            GFP_KERNEL);
+        } else {
+                iinfo->i_efe = 0;
+                iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+                                            sizeof(struct fileEntry),
+                                            GFP_KERNEL);
+        }
+        if (!iinfo->i_ext.i_data) {
+                iput(inode);
+                *err = -ENOMEM;
+                return NULL;
+        }
        block = udf_new_block(dir->i_sb, NULL,
                              dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
                lvhd->uniqueID = cpu_to_le64(uniqueID);
                mark_buffer_dirty(sbi->s_lvid_bh);
        }
+        mutex_unlock(&sbi->s_alloc_mutex);
        inode->i_mode = mode;
        inode->i_uid = current->fsuid;
        if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
        iinfo->i_lenEAttr = 0;
        iinfo->i_lenAlloc = 0;
        iinfo->i_use = 0;
-        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
-                iinfo->i_efe = 1;
-                if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
-                        sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
-                iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
-                                            sizeof(struct extendedFileEntry),
-                                            GFP_KERNEL);
-        } else {
-                iinfo->i_efe = 0;
-                iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
-                                            sizeof(struct fileEntry),
-                                            GFP_KERNEL);
-        }
-        if (!iinfo->i_ext.i_data) {
-                iput(inode);
-                *err = -ENOMEM;
-                mutex_unlock(&sbi->s_alloc_mutex);
-                return NULL;
-        }
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
        else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
                iinfo->i_crtime = current_fs_time(inode->i_sb);
        insert_inode_hash(inode);
        mark_inode_dirty(inode);
-        mutex_unlock(&sbi->s_alloc_mutex);
        if (DQUOT_ALLOC_INODE(inode)) {
                DQUOT_DROP(inode);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index f42f80a3b1fa..a44d68eb50b5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1338,6 +1338,10 @@ __xfs_get_blocks(
        offset = (xfs_off_t)iblock << inode->i_blkbits;
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
+        if (!create && direct && offset >= i_size_read(inode))
+                return 0;
        error = xfs_iomap(XFS_I(inode), offset, size,
                             create ? flags : BMAPI_READ, &iomap, &niomap);
        if (error)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 986061ae1b9b..36d5fcd3f593 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1001,12 +1001,13 @@ xfs_buf_iodone_work(
         * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
         * ordered flag and reissue them.  Because we can't tell the higher
         * layers directly that they should not issue ordered I/O anymore, they
-         * need to check if the ordered flag was cleared during I/O completion.
+         * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
         */
        if ((bp->b_error == EOPNOTSUPP) &&
            (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
                XB_TRACE(bp, "ordered_retry", bp->b_iodone);
                bp->b_flags &= ~XBF_ORDERED;
+                bp->b_flags |= _XFS_BARRIER_FAILED;
                xfs_buf_iorequest(bp);
        } else if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index fe0109956656..456519a088c7 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
         * modifications being lost.
         */
        _XBF_PAGE_LOCKED = (1 << 22),
+        /*
+         * If we try a barrier write but it fails, we have to communicate
+         * this to the upper layers.  Unfortunately b_error gets overwritten
+         * when the buffer is re-issued so we have to add another flag to
+         * keep this information.
+         */
+        _XFS_BARRIER_FAILED = (1 << 23),
 } xfs_buf_flags_t;
 typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 73c65f19e549..18d3c8487835 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1302,9 +1302,29 @@ xfs_fs_remount(
                        mp->m_flags &= ~XFS_MOUNT_BARRIER;
                        break;
                default:
+                        /*
+                         * Logically we would return an error here to prevent
+                         * users from believing they might have changed
+                         * mount options using remount which can't be changed.
+                         *
+                         * But unfortunately mount(8) adds all options from
+                         * mtab and fstab to the mount arguments in some cases
+                         * so we can't blindly reject options, but have to
+                         * check for each specified option if it actually
+                         * differs from the currently set option and only
+                         * reject it if that's the case.
+                         *
+                         * Until that is implemented we return success for
+                         * every remount request, and silently ignore all
+                         * options that we can't actually change.
+                         */
+#if 0
                        printk(KERN_INFO
        "XFS: mount option \"%s\" not supported for remount\n", p);
                        return -EINVAL;
+#else
+                        return 0;
+#endif
                }
        }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 608c30c3f76b..002fc2617c8e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,6 +732,7 @@ xfs_buf_item_init(
        bip->bli_item.li_ops = &xfs_buf_item_ops;
        bip->bli_item.li_mountp = mp;
        bip->bli_buf = bp;
+        xfs_buf_hold(bp);
        bip->bli_format.blf_type = XFS_LI_BUF;
        bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
        bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
        return (bip->bli_flags & XFS_BLI_DIRTY);
 }
+STATIC void
+xfs_buf_item_free(
+        xfs_buf_log_item_t      *bip)
+{
+#ifdef XFS_TRANS_DEBUG
+        kmem_free(bip->bli_orig);
+        kmem_free(bip->bli_logged);
+#endif /* XFS_TRANS_DEBUG */
+#ifdef XFS_BLI_TRACE
+        ktrace_free(bip->bli_trace);
+#endif
+        kmem_zone_free(xfs_buf_item_zone, bip);
+}
 /*
 * This is called when the buf log item is no longer needed.  It should
 * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
            (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
                XFS_BUF_CLR_IODONE_FUNC(bp);
        }
+        xfs_buf_rele(bp);
-#ifdef XFS_TRANS_DEBUG
+        xfs_buf_item_free(bip);
-        kmem_free(bip->bli_orig);
-        bip->bli_orig = NULL;
-        kmem_free(bip->bli_logged);
-        bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-#ifdef XFS_BLI_TRACE
-        ktrace_free(bip->bli_trace);
-#endif
-        kmem_zone_free(xfs_buf_item_zone, bip);
 }
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
        ASSERT(bip->bli_buf == bp);
+        xfs_buf_rele(bp);
        mp = bip->bli_item.li_mountp;
        /*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
         * xfs_trans_delete_ail() drops the AIL lock.
         */
        xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+        xfs_buf_item_free(bip);
-#ifdef XFS_TRANS_DEBUG
-        kmem_free(bip->bli_orig);
-        bip->bli_orig = NULL;
-        kmem_free(bip->bli_logged);
-        bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-#ifdef XFS_BLI_TRACE
-        ktrace_free(bip->bli_trace);
-#endif
-        kmem_zone_free(xfs_buf_item_zone, bip);
 }
 #if defined(XFS_BLI_TRACE)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 760f4c5b5160..75b0cd4da0ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -149,7 +149,14 @@ xfs_swap_extents(
        sbp = &sxp->sx_stat;
-        xfs_lock_two_inodes(ip, tip, lock_flags);
+        /*
+         * we have to do two separate lock calls here to keep lockdep
+         * happy. If we try to get all the locks in one call, lockdep will
+         * report false positives when we drop the ILOCK and regain them
+         * below.
+         */
+        xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+        xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
        locked = 1;
        /* Verify that both files have the same format */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 00e80df9dd9d..dbd9cef852ec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct(
        ASSERT(nextents <= XFS_LINEAR_EXTS);
        size = nextents * sizeof(xfs_bmbt_rec_t);
-        xfs_iext_irec_compact_full(ifp);
+        xfs_iext_irec_compact_pages(ifp);
        ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
        ep = ifp->if_u1.if_ext_irec->er_extbuf;
@@ -4449,8 +4449,7 @@ xfs_iext_irec_remove(
 * compaction policy is as follows:
 *
 *    Full Compaction: Extents fit into a single page (or inline buffer)
- *    Full Compaction: Extents occupy less than 10% of allocated space
+ * Partial Compaction: Extents occupy less than 50% of allocated space
- * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
 *      No Compaction: Extents occupy at least 50% of allocated space
 */
 void
@@ -4471,8 +4470,6 @@ xfs_iext_irec_compact(
                xfs_iext_direct_to_inline(ifp, nextents);
        } else if (nextents <= XFS_LINEAR_EXTS) {
                xfs_iext_indirect_to_direct(ifp);
-        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
-                xfs_iext_irec_compact_full(ifp);
        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
                xfs_iext_irec_compact_pages(ifp);
        }
@@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages(
                erp_next = erp + 1;
                if (erp_next->er_extcount <=
                    (XFS_LINEAR_EXTS - erp->er_extcount)) {
-                        memmove(&erp->er_extbuf[erp->er_extcount],
+                        memcpy(&erp->er_extbuf[erp->er_extcount],
                                erp_next->er_extbuf, erp_next->er_extcount *
                                sizeof(xfs_bmbt_rec_t));
                        erp->er_extcount += erp_next->er_extcount;
@@ -4516,91 +4513,6 @@ xfs_iext_irec_compact_pages(
 }
 /*
- * Fully compact the extent records managed by the indirection array.
- */
-void
-xfs_iext_irec_compact_full(
-        xfs_ifork_t     *ifp)                   /* inode fork pointer */
-{
-        xfs_bmbt_rec_host_t *ep, *ep_next;      /* extent record pointers */
-        xfs_ext_irec_t  *erp, *erp_next;        /* extent irec pointers */
-        int             erp_idx = 0;            /* extent irec index */
-        int             ext_avail;              /* empty entries in ex list */
-        int             ext_diff;               /* number of exts to add */
-        int             nlists;                 /* number of irec's (ex lists) */
-        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-        erp = ifp->if_u1.if_ext_irec;
-        ep = &erp->er_extbuf[erp->er_extcount];
-        erp_next = erp + 1;
-        ep_next = erp_next->er_extbuf;
-        while (erp_idx < nlists - 1) {
-                /*
-                 * Check how many extent records are available in this irec.
-                 * If there is none skip the whole exercise.
-                 */
-                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-                if (ext_avail) {
-                        /*
-                         * Copy over as many as possible extent records into
-                         * the previous page.
-                         */
-                        ext_diff = MIN(ext_avail, erp_next->er_extcount);
-                        memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
-                        erp->er_extcount += ext_diff;
-                        erp_next->er_extcount -= ext_diff;
-                        /*
-                         * If the next irec is empty now we can simply
-                         * remove it.
-                         */
-                        if (erp_next->er_extcount == 0) {
-                                /*
-                                 * Free page before removing extent record
-                                 * so er_extoffs don't get modified in
-                                 * xfs_iext_irec_remove.
-                                 */
-                                kmem_free(erp_next->er_extbuf);
-                                erp_next->er_extbuf = NULL;
-                                xfs_iext_irec_remove(ifp, erp_idx + 1);
-                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
-                                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-                        /*
-                         * If the next irec is not empty move up the content
-                         * that has not been copied to the previous page to
-                         * the beggining of this one.
-                         */
-                        } else {
-                                memmove(erp_next->er_extbuf, &ep_next[ext_diff],
-                                        erp_next->er_extcount *
-                                        sizeof(xfs_bmbt_rec_t));
-                                ep_next = erp_next->er_extbuf;
-                                memset(&ep_next[erp_next->er_extcount], 0,
-                                        (XFS_LINEAR_EXTS -
-                                                erp_next->er_extcount) *
-                                        sizeof(xfs_bmbt_rec_t));
-                        }
-                }
-                if (erp->er_extcount == XFS_LINEAR_EXTS) {
-                        erp_idx++;
-                        if (erp_idx < nlists)
-                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
-                        else
-                                break;
-                }
-                ep = &erp->er_extbuf[erp->er_extcount];
-                erp_next = erp + 1;
-                ep_next = erp_next->er_extbuf;
-        }
-}
-/*
 * This is called to update the er_extoff field in the indirection
 * array when extents have been added or removed from one of the
 * extent lists. erp_idx contains the irec index to begin updating
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ccba14eb9dbe..0b02c6443551 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
 STATIC int      xlog_iclogs_empty(xlog_t *log);
 #if defined(XFS_LOG_TRACE)
+#define XLOG_TRACE_LOGGRANT_SIZE        2048
+#define XLOG_TRACE_ICLOG_SIZE           256
+void
+xlog_trace_loggrant_alloc(xlog_t *log)
+{       /*
+         * Set up the grant trace buffer for this log.  The ktrace_alloc()
+         * result is stored in log->l_grant_trace without being checked.
+         * NOTE(review): xlog_trace_loggrant() no longer allocates lazily
+         * or tests for NULL, so confirm that a KM_NOFS ktrace_alloc()
+         * cannot return NULL.
+         */
+        log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
+}
+void
+xlog_trace_loggrant_dealloc(xlog_t *log)
+{       /*
+         * Hand the grant trace buffer to ktrace_free(); xlog_dealloc_log()
+         * calls this during teardown.
+         * NOTE(review): the open-coded teardown this replaces tested
+         * l_grant_trace for NULL first - confirm ktrace_free() accepts
+         * NULL or that the pointer is always set by this point.
+         */
+        ktrace_free(log->l_grant_trace);
+}
 void
 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 {
        unsigned long cnts;
-        if (!log->l_grant_trace) {
-                log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
-                if (!log->l_grant_trace)
-                        return;
-        }
        /* ticket counts are 1 byte each */
        cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 }
 void
+xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
+{       /*
+         * Create this iclog's trace buffer up front.  xlog_alloc_log()
+         * calls this for each iclog it sets up, and xlog_trace_iclog()
+         * no longer allocates on first use.  The ktrace_alloc() result
+         * is stored in iclog->ic_trace without being checked.
+         */
+        iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
+}
+void
+xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
+{       /*
+         * Release the iclog's trace buffer.  xlog_dealloc_log() calls
+         * this before the iclog itself is freed with kmem_free().
+         * NOTE(review): the code this replaces skipped ktrace_free()
+         * when ic_trace was NULL - confirm that guard is no longer
+         * needed.
+         */
+        ktrace_free(iclog->ic_trace);
+}
+void
 xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
 {
-        if (!iclog->ic_trace)
-                iclog->ic_trace = ktrace_alloc(256, KM_NOFS);
        ktrace_enter(iclog->ic_trace,
                     (void *)((unsigned long)state),
                     (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
                     (void *)NULL, (void *)NULL);
 }
 #else
+#define xlog_trace_loggrant_alloc(log)
+#define xlog_trace_loggrant_dealloc(log)
 #define xlog_trace_loggrant(log,tic,string)
+#define xlog_trace_iclog_alloc(iclog)
+#define xlog_trace_iclog_dealloc(iclog)
 #define xlog_trace_iclog(iclog,state)
 #endif /* XFS_LOG_TRACE */
@@ -1005,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
        l = iclog->ic_log;
        /*
-         * If the ordered flag has been removed by a lower
+         * If the _XFS_BARRIER_FAILED flag was set by a lower
-         * layer, it means the underlyin device no longer supports
+         * layer, it means the underlying device no longer supports
         * barrier I/O. Warn loudly and turn off barriers.
         */
-        if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) {
+        if (bp->b_flags & _XFS_BARRIER_FAILED) {
+                bp->b_flags &= ~_XFS_BARRIER_FAILED;
                l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
                xfs_fs_cmn_err(CE_WARN, l->l_mp,
                                "xlog_iodone: Barriers are no longer supported"
@@ -1231,6 +1260,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
        spin_lock_init(&log->l_grant_lock);
        sv_init(&log->l_flush_wait, 0, "flush_wait");
+        xlog_trace_loggrant_alloc(log);
        /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
        ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1285,6 +1315,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
                sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
                sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+                xlog_trace_iclog_alloc(iclog);
                iclogp = &iclog->ic_next;
        }
        *iclogp = log->l_iclog;                 /* complete ring */
@@ -1565,11 +1597,7 @@ xlog_dealloc_log(xlog_t *log)
                sv_destroy(&iclog->ic_force_wait);
                sv_destroy(&iclog->ic_write_wait);
                xfs_buf_free(iclog->ic_bp);
-#ifdef XFS_LOG_TRACE
+                xlog_trace_iclog_dealloc(iclog);
-                if (iclog->ic_trace != NULL) {
-                        ktrace_free(iclog->ic_trace);
-                }
-#endif
                next_iclog = iclog->ic_next;
                kmem_free(iclog);
                iclog = next_iclog;
@@ -1578,14 +1606,7 @@ xlog_dealloc_log(xlog_t *log)
        spinlock_destroy(&log->l_grant_lock);
        xfs_buf_free(log->l_xbuf);
-#ifdef XFS_LOG_TRACE
+        xlog_trace_loggrant_dealloc(log);
-        if (log->l_trace != NULL) {
-                ktrace_free(log->l_trace);
-        }
-        if (log->l_grant_trace != NULL) {
-                ktrace_free(log->l_grant_trace);
-        }
-#endif
        log->l_mp->m_log = NULL;
        kmem_free(log);
 }       /* xlog_dealloc_log */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c8a5b22ee3e3..e7d8f84443fa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -448,7 +448,6 @@ typedef struct log {
        int                     l_grant_write_bytes;
 #ifdef XFS_LOG_TRACE
-        struct ktrace           *l_trace;
        struct ktrace           *l_grant_trace;
 #endif
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index aa238c8fbd7a..8b6812f66a15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1838,6 +1838,12 @@ again:
 #endif
 }
+/*
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
+ */
 void
 xfs_lock_two_inodes(
        xfs_inode_t             *ip0,
@@ -1848,6 +1854,8 @@ xfs_lock_two_inodes(
        int                     attempts = 0;
        xfs_log_item_t          *lp;
+        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+                ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
        ASSERT(ip0->i_ino != ip1->i_ino);
        if (ip0->i_ino > ip1->i_ino) {
@@ -3152,6 +3160,13 @@ error1:	/* Just cancel transaction */
 /*
 * Zero file bytes between startoff and endoff inclusive.
 * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
 */
 STATIC int
 xfs_zero_remaining_bytes(
@@ -3168,6 +3183,17 @@ xfs_zero_remaining_bytes(
        int                     nimap;
        int                     error = 0;
+        /*
+         * Avoid doing I/O beyond eof - it's not necessary
+         * since nothing can read beyond eof.  The space will
+         * be zeroed when the file is extended anyway.
+         */
+        if (startoff >= ip->i_size)
+                return 0;
+        if (endoff > ip->i_size)
+                endoff = ip->i_size;
        bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
                                XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp);