28 files changed, 449 insertions, 397 deletions
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 49f189423063..7ad36506c256 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
        struct net_device *dev;
        int ret = -ENODEV;
-        if (maclen != ETH_ALEN)
+        BUG_ON(maclen != ETH_ALEN);
-                BUG();
        rtnl_lock();
        dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b95563663..acf678831103 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
        if (depth == 0)
                return (err);
-reread:
-        partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+        partial = ext2_get_branch(inode, depth, offsets, chain, &err);
        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext2_fsblk_t blk;
-                        if (!verify_chain(chain, partial)) {
+                        if (!verify_chain(chain, chain + depth - 1)) {
                                /*
                                 * Indirect block might be removed by
                                 * truncate while we were reading it.
                                 * Handling of that case: forget what we've
                                 * got now, go to reread.
                                 */
+                                err = -EAGAIN;
                                count = 0;
-                                goto changed;
+                                break;
                        }
                        blk = le32_to_cpu(*(chain[depth-1].p + count));
                        if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
                        else
                                break;
                }
-                goto got_it;
+                if (err != -EAGAIN)
+                        goto got_it;
        }
        /* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
                goto cleanup;
        mutex_lock(&ei->truncate_mutex);
+        /*
+         * If the indirect block is missing while we are reading
+         * the chain (ext2_get_branch() returns -EAGAIN err), or
+         * if the chain has been changed after we grab truncate_mutex
+         * (either because another process truncated this branch, or
+         * another get_block allocated this branch), re-grab the chain to see
+         * if the requested block has been allocated or not.
+         *
+         * Since we already block the truncate/other get_block
+         * at this point, we will have the current copy of the chain when we
+         * splice the branch into the tree.
+         */
+        if (err == -EAGAIN || !verify_chain(chain, partial)) {
+                while (partial > chain) {
+                        brelse(partial->bh);
+                        partial--;
+                }
+                partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+                if (!partial) {
+                        count++;
+                        mutex_unlock(&ei->truncate_mutex);
+                        if (err)
+                                goto cleanup;
+                        clear_buffer_new(bh_result);
+                        goto got_it;
+                }
+        }
        /*
         * Okay, we need to do block allocation.  Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
                partial--;
        }
        return err;
-changed:
-        while (partial > chain) {
-                brelse(partial->bh);
-                partial--;
-        }
-        goto reread;
 }
 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ac77d8b8251d..6132353dcf62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -342,7 +342,7 @@ static int ext4_valid_extent_idx(struct inode *inode,
        ext4_fsblk_t block = idx_pblock(ext_idx);
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-                        (block > ext4_blocks_count(es))))
+                        (block >= ext4_blocks_count(es))))
                return 0;
        else
                return 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a2e7952bc5f9..c6bd6ced3bb7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode,
 }
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-                                 unsigned int *p, unsigned int max) {
+                                 __le32 *p, unsigned int max) {
        unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
-        unsigned int *bref = p;
+        __le32 *bref = p;
        while (bref < p+max) {
-                if (unlikely(*bref >= maxblocks)) {
+                if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
                        ext4_error(inode->i_sb, function,
                                   "block reference %u >= max (%u) "
                                   "in inode #%lu, offset=%d",
-                                   *bref, maxblocks,
+                                   le32_to_cpu(*bref), maxblocks,
                                   inode->i_ino, (int)(bref-p));
                        return -EIO;
                }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9987bba99db3..2958f4e6f222 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
+        /* check blocks count against device size */
+        blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+                printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
+                       "exceeds size of device (%llu blocks)\n",
+                       ext4_blocks_count(es), blocks_count);
+                goto failed_mount;
+        }
        /*
         * It makes no sense for the first data block to be beyond the end
         * of the filesystem.
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9435dda8f1e0..a1cbff2b4d99 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
                BUG();
                return 0;
        }
+        if (!tree)
+                return 0;
        if (tree->node_size >= PAGE_CACHE_SIZE) {
                nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
                spin_lock(&tree->hash_lock);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 36ca2e1a4fa3..7b6165f25fbe 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb)
        if (HFS_SB(sb)->nls_disk)
                unload_nls(HFS_SB(sb)->nls_disk);
+        free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
        kfree(HFS_SB(sb));
        sb->s_fs_info = NULL;
 }
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649bbbdc..3e9afc2a91d2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -55,6 +55,25 @@
 *                      need do nothing.
 * RevokeValid set, Revoked set:
 *                      buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, the replay code also uses the hash tables, but at that point no
+ * one else can touch them (the filesystem isn't mounted yet) and hence no
+ * locking is needed.
 */
 #ifndef __KERNEL__
@@ -402,8 +421,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 * the second time we would still have a pending revoke to cancel.  So,
 * do not trust the Revoked bit on buffers unless RevokeValid is also
 * set.
- *
- * The caller must have the journal locked.
 */
 int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -481,10 +498,7 @@ void journal_switch_revoke_table(journal_t *journal)
 /*
 * Write revoke records to the journal for all entries in the current
 * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
 */
 void journal_write_revoke_records(journal_t *journal,
                                  transaction_t *transaction)
 {
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 24638e059bf3..064279e33bbb 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -688,6 +688,8 @@ static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
        .bpop_translate         =       NULL,
 };
+static struct lock_class_key nilfs_bmap_dat_lock_key;
 /**
 * nilfs_bmap_read - read a bmap from an inode
 * @bmap: bmap
@@ -715,6 +717,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
                bmap->b_pops = &nilfs_bmap_ptr_ops_p;
                bmap->b_last_allocated_key = 0; /* XXX: use macro */
                bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+                lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
                break;
        case NILFS_CPFILE_INO:
        case NILFS_SUFILE_INO:
@@ -772,6 +775,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 {
        memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
        init_rwsem(&gcbmap->b_sem);
+        lockdep_set_class(&gcbmap->b_sem, &nilfs_bmap_dat_lock_key);
        gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
 }
@@ -779,5 +783,6 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 {
        memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
        init_rwsem(&bmap->b_sem);
+        lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
 }
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 7558c977db02..3d0c18a16db1 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -35,11 +35,6 @@
 #include "bmap_union.h"
 /*
- * NILFS filesystem version
- */
-#define NILFS_VERSION           "2.0.5"
-/*
 * nilfs inode data in memory
 */
 struct nilfs_inode_info {
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6ade0963fc1d..4fc081e47d70 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -413,7 +413,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
        struct nilfs_segment_entry *ent, *n;
        struct inode *sufile = nilfs->ns_sufile;
        __u64 segnum[4];
-        time_t mtime;
        int err;
        int i;
@@ -442,24 +441,13 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
         * Collecting segments written after the latest super root.
         * These are marked dirty to avoid being reallocated in the next write.
         */
-        mtime = get_seconds();
        list_for_each_entry_safe(ent, n, head, list) {
-                if (ent->segnum == segnum[0]) {
+                if (ent->segnum != segnum[0]) {
-                        list_del(&ent->list);
+                        err = nilfs_sufile_scrap(sufile, ent->segnum);
-                        nilfs_free_segment_entry(ent);
+                        if (unlikely(err))
-                        continue;
+                                goto failed;
-                }
-                err = nilfs_open_segment_entry(ent, sufile);
-                if (unlikely(err))
-                        goto failed;
-                if (!nilfs_segment_usage_dirty(ent->raw_su)) {
-                        /* make the segment garbage */
-                        ent->raw_su->su_nblocks = cpu_to_le32(0);
-                        ent->raw_su->su_lastmod = cpu_to_le32(mtime);
-                        nilfs_segment_usage_set_dirty(ent->raw_su);
                }
                list_del(&ent->list);
-                nilfs_close_segment_entry(ent, sufile);
                nilfs_free_segment_entry(ent);
        }
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index c774cf397e2f..98e68677f045 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -93,6 +93,52 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
                                   create, NULL, bhp);
 }
+static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
+                                     u64 ncleanadd, u64 ndirtyadd)
+{
+        struct nilfs_sufile_header *header;
+        void *kaddr;
+        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+        header = kaddr + bh_offset(header_bh);
+        le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
+        le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
+        kunmap_atomic(kaddr, KM_USER0);
+        nilfs_mdt_mark_buffer_dirty(header_bh);
+}
+int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
+                        void (*dofunc)(struct inode *, __u64,
+                                       struct buffer_head *,
+                                       struct buffer_head *))
+{
+        struct buffer_head *header_bh, *bh;
+        int ret;
+        if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
+                printk(KERN_WARNING "%s: invalid segment number: %llu\n",
+                       __func__, (unsigned long long)segnum);
+                return -EINVAL;
+        }
+        down_write(&NILFS_MDT(sufile)->mi_sem);
+        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+        if (ret < 0)
+                goto out_sem;
+        ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
+        if (!ret) {
+                dofunc(sufile, segnum, header_bh, bh);
+                brelse(bh);
+        }
+        brelse(header_bh);
+ out_sem:
+        up_write(&NILFS_MDT(sufile)->mi_sem);
+        return ret;
+}
 /**
 * nilfs_sufile_alloc - allocate a segment
 * @sufile: inode of segment usage file
@@ -113,7 +159,6 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
 int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 {
        struct buffer_head *header_bh, *su_bh;
-        struct the_nilfs *nilfs;
        struct nilfs_sufile_header *header;
        struct nilfs_segment_usage *su;
        size_t susz = NILFS_MDT(sufile)->mi_entry_size;
@@ -124,8 +169,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
        down_write(&NILFS_MDT(sufile)->mi_sem);
-        nilfs = NILFS_MDT(sufile)->mi_nilfs;
        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
        if (ret < 0)
                goto out_sem;
@@ -192,165 +235,84 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
        return ret;
 }
-/**
+void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
- * nilfs_sufile_cancel_free -
+                                 struct buffer_head *header_bh,
- * @sufile: inode of segment usage file
+                                 struct buffer_head *su_bh)
- * @segnum: segment number
- *
- * Description:
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
 {
-        struct buffer_head *header_bh, *su_bh;
-        struct the_nilfs *nilfs;
-        struct nilfs_sufile_header *header;
        struct nilfs_segment_usage *su;
        void *kaddr;
-        int ret;
-        down_write(&NILFS_MDT(sufile)->mi_sem);
-        nilfs = NILFS_MDT(sufile)->mi_nilfs;
-        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-        if (ret < 0)
-                goto out_sem;
-        ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
-        if (ret < 0)
-                goto out_header;
        kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
-        su = nilfs_sufile_block_get_segment_usage(
+        su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
-                sufile, segnum, su_bh, kaddr);
        if (unlikely(!nilfs_segment_usage_clean(su))) {
                printk(KERN_WARNING "%s: segment %llu must be clean\n",
                       __func__, (unsigned long long)segnum);
                kunmap_atomic(kaddr, KM_USER0);
-                goto out_su_bh;
+                return;
        }
        nilfs_segment_usage_set_dirty(su);
        kunmap_atomic(kaddr, KM_USER0);
-        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+        nilfs_sufile_mod_counter(header_bh, -1, 1);
-        header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
-        le64_add_cpu(&header->sh_ncleansegs, -1);
-        le64_add_cpu(&header->sh_ndirtysegs, 1);
-        kunmap_atomic(kaddr, KM_USER0);
-        nilfs_mdt_mark_buffer_dirty(header_bh);
        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_mdt_mark_dirty(sufile);
- out_su_bh:
-        brelse(su_bh);
- out_header:
-        brelse(header_bh);
- out_sem:
-        up_write(&NILFS_MDT(sufile)->mi_sem);
-        return ret;
 }
-/**
+void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
- * nilfs_sufile_freev - free segments
+                           struct buffer_head *header_bh,
- * @sufile: inode of segment usage file
+                           struct buffer_head *su_bh)
- * @segnum: array of segment numbers
- * @nsegs: number of segments
- *
- * Description: nilfs_sufile_freev() frees segments specified by @segnum and
- * @nsegs, which must have been returned by a previous call to
- * nilfs_sufile_alloc().
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-#define NILFS_SUFILE_FREEV_PREALLOC     16
-int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs)
 {
-        struct buffer_head *header_bh, **su_bh,
-                *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC];
-        struct the_nilfs *nilfs;
-        struct nilfs_sufile_header *header;
        struct nilfs_segment_usage *su;
        void *kaddr;
-        int ret, i;
+        int clean, dirty;
-        down_write(&NILFS_MDT(sufile)->mi_sem);
+        kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+        su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
-        nilfs = NILFS_MDT(sufile)->mi_nilfs;
+        if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+            su->su_nblocks == cpu_to_le32(0)) {
-        /* prepare resources */
-        if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC)
-                su_bh = su_bh_prealloc;
-        else {
-                su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS);
-                if (su_bh == NULL) {
-                        ret = -ENOMEM;
-                        goto out_sem;
-                }
-        }
-        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-        if (ret < 0)
-                goto out_su_bh;
-        for (i = 0; i < nsegs; i++) {
-                ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i],
-                                                           0, &su_bh[i]);
-                if (ret < 0)
-                        goto out_bh;
-        }
-        /* free segments */
-        for (i = 0; i < nsegs; i++) {
-                kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0);
-                su = nilfs_sufile_block_get_segment_usage(
-                        sufile, segnum[i], su_bh[i], kaddr);
-                WARN_ON(nilfs_segment_usage_error(su));
-                nilfs_segment_usage_set_clean(su);
                kunmap_atomic(kaddr, KM_USER0);
-                nilfs_mdt_mark_buffer_dirty(su_bh[i]);
+                return;
        }
-        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+        clean = nilfs_segment_usage_clean(su);
-        header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+        dirty = nilfs_segment_usage_dirty(su);
-        le64_add_cpu(&header->sh_ncleansegs, nsegs);
-        le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs);
+        /* make the segment garbage */
+        su->su_lastmod = cpu_to_le64(0);
+        su->su_nblocks = cpu_to_le32(0);
+        su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
        kunmap_atomic(kaddr, KM_USER0);
-        nilfs_mdt_mark_buffer_dirty(header_bh);
+        nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_mdt_mark_dirty(sufile);
+}
- out_bh:
+void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
-        for (i--; i >= 0; i--)
+                          struct buffer_head *header_bh,
-                brelse(su_bh[i]);
+                          struct buffer_head *su_bh)
-        brelse(header_bh);
+{
+        struct nilfs_segment_usage *su;
+        void *kaddr;
+        int sudirty;
- out_su_bh:
+        kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
-        if (su_bh != su_bh_prealloc)
+        su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
-                kfree(su_bh);
+        if (nilfs_segment_usage_clean(su)) {
+                printk(KERN_WARNING "%s: segment %llu is already clean\n",
+                       __func__, (unsigned long long)segnum);
+                kunmap_atomic(kaddr, KM_USER0);
+                return;
+        }
+        WARN_ON(nilfs_segment_usage_error(su));
+        WARN_ON(!nilfs_segment_usage_dirty(su));
- out_sem:
+        sudirty = nilfs_segment_usage_dirty(su);
-        up_write(&NILFS_MDT(sufile)->mi_sem);
+        nilfs_segment_usage_set_clean(su);
-        return ret;
+        kunmap_atomic(kaddr, KM_USER0);
-}
+        nilfs_mdt_mark_buffer_dirty(su_bh);
-/**
+        nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
- * nilfs_sufile_free -
+        nilfs_mdt_mark_dirty(sufile);
- * @sufile:
- * @segnum:
- */
-int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
-{
-        return nilfs_sufile_freev(sufile, &segnum, 1);
 }
 /**
@@ -500,72 +462,28 @@ int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
        return ret;
 }
-/**
+void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
- * nilfs_sufile_set_error - mark a segment as erroneous
+                               struct buffer_head *header_bh,
- * @sufile: inode of segment usage file
+                               struct buffer_head *su_bh)
- * @segnum: segment number
- *
- * Description: nilfs_sufile_set_error() marks the segment specified by
- * @segnum as erroneous. The error segment will never be used again.
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-EINVAL - Invalid segment usage number.
- */
-int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
 {
-        struct buffer_head *header_bh, *su_bh;
        struct nilfs_segment_usage *su;
-        struct nilfs_sufile_header *header;
        void *kaddr;
-        int ret;
+        int suclean;
-        if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
-                printk(KERN_WARNING "%s: invalid segment number: %llu\n",
-                       __func__, (unsigned long long)segnum);
-                return -EINVAL;
-        }
-        down_write(&NILFS_MDT(sufile)->mi_sem);
-        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-        if (ret < 0)
-                goto out_sem;
-        ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
-        if (ret < 0)
-                goto out_header;
        kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
        su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
        if (nilfs_segment_usage_error(su)) {
                kunmap_atomic(kaddr, KM_USER0);
-                brelse(su_bh);
+                return;
-                goto out_header;
        }
+        suclean = nilfs_segment_usage_clean(su);
        nilfs_segment_usage_set_error(su);
        kunmap_atomic(kaddr, KM_USER0);
-        brelse(su_bh);
-        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+        if (suclean)
-        header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+                nilfs_sufile_mod_counter(header_bh, -1, 0);
-        le64_add_cpu(&header->sh_ndirtysegs, -1);
-        kunmap_atomic(kaddr, KM_USER0);
-        nilfs_mdt_mark_buffer_dirty(header_bh);
        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_mdt_mark_dirty(sufile);
-        brelse(su_bh);
- out_header:
-        brelse(header_bh);
- out_sem:
-        up_write(&NILFS_MDT(sufile)->mi_sem);
-        return ret;
 }
 /**
@@ -625,7 +543,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
                        si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
                        si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
                                ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
-                        if (nilfs_segment_is_active(nilfs, segnum + i + j))
+                        if (nilfs_segment_is_active(nilfs, segnum + j))
                                si[i + j].sui_flags |=
                                        (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
                }
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index d595f33a768d..a2e2efd4ade1 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -36,9 +36,6 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 }
 int nilfs_sufile_alloc(struct inode *, __u64 *);
-int nilfs_sufile_cancel_free(struct inode *, __u64);
-int nilfs_sufile_freev(struct inode *, __u64 *, size_t);
-int nilfs_sufile_free(struct inode *, __u64);
 int nilfs_sufile_get_segment_usage(struct inode *, __u64,
                                   struct nilfs_segment_usage **,
                                   struct buffer_head **);
@@ -46,9 +43,83 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
                                    struct buffer_head *);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
 int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
-int nilfs_sufile_set_error(struct inode *, __u64);
 ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
                                size_t);
+int nilfs_sufile_update(struct inode *, __u64, int,
+                        void (*dofunc)(struct inode *, __u64,
+                                       struct buffer_head *,
+                                       struct buffer_head *));
+void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
+                                 struct buffer_head *);
+void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
+                           struct buffer_head *);
+void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
+                          struct buffer_head *);
+void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
+                               struct buffer_head *);
+/**
+ * nilfs_sufile_cancel_free - turn a clean segment into a dirty one
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ *
+ * Description:
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
+{
+        return nilfs_sufile_update(sufile, segnum, 0,
+                                   nilfs_sufile_do_cancel_free);
+}
+/**
+ * nilfs_sufile_scrap - make a segment garbage
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to be made garbage
+ */
+static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
+{
+        return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
+}
+/**
+ * nilfs_sufile_free - free segment
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to be freed
+ */
+static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
+{
+        return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
+}
+/**
+ * nilfs_sufile_set_error - mark a segment as erroneous
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ *
+ * Description: nilfs_sufile_set_error() marks the segment specified by
+ * @segnum as erroneous. The error segment will never be used again.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid segment usage number.
+ */
+static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
+{
+        return nilfs_sufile_update(sufile, segnum, 0,
+                                   nilfs_sufile_do_set_error);
+}
 #endif  /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e117e1ea9bff..6989b03e97ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -63,7 +63,6 @@
 MODULE_AUTHOR("NTT Corp.");
 MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
                   "(NILFS)");
-MODULE_VERSION(NILFS_VERSION);
 MODULE_LICENSE("GPL");
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
@@ -476,11 +475,12 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct the_nilfs *nilfs = sbi->s_nilfs;
+        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
        unsigned long long blocks;
        unsigned long overhead;
        unsigned long nrsvblocks;
        sector_t nfreeblocks;
-        struct the_nilfs *nilfs = sbi->s_nilfs;
        int err;
        /*
@@ -514,6 +514,9 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_files = atomic_read(&sbi->s_inodes_count);
        buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
        buf->f_namelen = NILFS_NAME_LEN;
+        buf->f_fsid.val[0] = (u32)id;
+        buf->f_fsid.val[1] = (u32)(id >> 32);
        return 0;
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33400cf0bbe2..7f65b3be4aa9 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -115,6 +115,7 @@ void put_nilfs(struct the_nilfs *nilfs)
 static int nilfs_load_super_root(struct the_nilfs *nilfs,
                                 struct nilfs_sb_info *sbi, sector_t sr_block)
 {
+        static struct lock_class_key dat_lock_key;
        struct buffer_head *bh_sr;
        struct nilfs_super_root *raw_sr;
        struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -163,6 +164,9 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
        if (unlikely(err))
                goto failed_sufile;
+        lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
+        lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
        nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
        nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
                                 sizeof(struct nilfs_cpfile_header));
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c13f67300fe7..7ec89fc05b2b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -153,23 +153,6 @@ xfs_find_bdev_for_inode(
 }
 /*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
-        xfs_ioend_t     *ioend,
-        int             wait)
-{
-        if (atomic_dec_and_test(&ioend->io_remaining)) {
-                queue_work(xfsdatad_workqueue, &ioend->io_work);
-                if (wait)
-                        flush_workqueue(xfsdatad_workqueue);
-        }
-}
-/*
 * We're now finished for good with this ioend structure.
 * Update the page state via the associated buffer_heads,
 * release holds on the inode and bio, and finally free
@@ -310,6 +293,27 @@ xfs_end_bio_read(
 }
 /*
+ * Drop one hold on this ioend and, if it was the final one, schedule IO
+ * completion handling: unwritten extent conversion (xfs_end_bio_unwritten)
+ * goes to xfsconvertd, all else to xfsdatad. If asked to wait, flush it.
+ */
+STATIC void
+xfs_finish_ioend(
+        xfs_ioend_t     *ioend,
+        int             wait)
+{
+        if (atomic_dec_and_test(&ioend->io_remaining)) {
+                struct workqueue_struct *wq = xfsdatad_workqueue;
+                if (ioend->io_work.func == xfs_end_bio_unwritten)
+                        wq = xfsconvertd_workqueue;
+                queue_work(wq, &ioend->io_work);
+                if (wait)
+                        flush_workqueue(wq);
+        }
+}
+/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 1dd528849755..221b3e66ceef 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -19,6 +19,7 @@
 #define __XFS_AOPS_H__
 extern struct workqueue_struct *xfsdatad_workqueue;
+extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 /*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1016bb9134..e28800a9f2b5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = {
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsconvertd_workqueue;
 #ifdef XFS_BUF_TRACE
 void
@@ -1775,6 +1776,7 @@ xfs_flush_buftarg(
        xfs_buf_t       *bp, *n;
        int             pincount = 0;
+        xfs_buf_runall_queues(xfsconvertd_workqueue);
        xfs_buf_runall_queues(xfsdatad_workqueue);
        xfs_buf_runall_queues(xfslogd_workqueue);
@@ -1831,9 +1833,15 @@ xfs_buf_init(void)
        if (!xfsdatad_workqueue)
                goto out_destroy_xfslogd_workqueue;
+        xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+        if (!xfsconvertd_workqueue)
+                goto out_destroy_xfsdatad_workqueue;
        register_shrinker(&xfs_buf_shake);
        return 0;
+ out_destroy_xfsdatad_workqueue:
+        destroy_workqueue(xfsdatad_workqueue);
 out_destroy_xfslogd_workqueue:
        destroy_workqueue(xfslogd_workqueue);
 out_free_buf_zone:
@@ -1849,6 +1857,7 @@ void
 xfs_buf_terminate(void)
 {
        unregister_shrinker(&xfs_buf_shake);
+        destroy_workqueue(xfsconvertd_workqueue);
        destroy_workqueue(xfsdatad_workqueue);
        destroy_workqueue(xfslogd_workqueue);
        kmem_zone_destroy(xfs_buf_zone);
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 5aeb77776961..08be36d7326c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -74,14 +74,14 @@ xfs_flush_pages(
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                xfs_iflags_clear(ip, XFS_ITRUNCATED);
-                ret = filemap_fdatawrite(mapping);
+                ret = -filemap_fdatawrite(mapping);
-                if (flags & XFS_B_ASYNC)
-                        return -ret;
-                ret2 = filemap_fdatawait(mapping);
-                if (!ret)
-                        ret = ret2;
        }
-        return -ret;
+        if (flags & XFS_B_ASYNC)
+                return ret;
+        ret2 = xfs_wait_on_pages(ip, first, last);
+        if (!ret)
+                ret = ret2;
+        return ret;
 }
 int
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7e90daa0d1d1..9142192ccbe6 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -751,10 +751,26 @@ start:
                        goto relock;
                }
        } else {
+                int enospc = 0;
+                ssize_t ret2 = 0;
+write_retry:
                xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
                                *offset, ioflags);
-                ret = generic_file_buffered_write(iocb, iovp, segs,
+                ret2 = generic_file_buffered_write(iocb, iovp, segs,
                                pos, offset, count, ret);
+                /*
+                 * if we just got an ENOSPC, flush the inode now we
+                 * aren't holding any page locks and retry *once*
+                 */
+                if (ret2 == -ENOSPC && !enospc) {
+                        error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
+                        if (error)
+                                goto out_unlock_internal;
+                        enospc = 1;
+                        goto write_retry;
+                }
+                ret = ret2;
        }
        current->backing_dev_info = NULL;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a608e72fa405..f7ba76633c29 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
        uint32_t        first_index = 0;
        int             error = 0;
        int             last_error = 0;
-        int             fflag = XFS_B_ASYNC;
-        if (flags & SYNC_DELWRI)
-                fflag = XFS_B_DELWRI;
-        if (flags & SYNC_WAIT)
-                fflag = 0;              /* synchronous overrides all */
        do {
                struct inode    *inode;
@@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
                 * If we have to flush data or wait for I/O completion
                 * we need to hold the iolock.
                 */
-                if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
+                if (flags & SYNC_DELWRI) {
-                        xfs_ilock(ip, XFS_IOLOCK_SHARED);
+                        if (VN_DIRTY(inode)) {
-                        lock_flags |= XFS_IOLOCK_SHARED;
+                                if (flags & SYNC_TRYLOCK) {
-                        error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
+                                        if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-                        if (flags & SYNC_IOWAIT)
+                                                lock_flags |= XFS_IOLOCK_SHARED;
+                                } else {
+                                        xfs_ilock(ip, XFS_IOLOCK_SHARED);
+                                        lock_flags |= XFS_IOLOCK_SHARED;
+                                }
+                                if (lock_flags & XFS_IOLOCK_SHARED) {
+                                        error = xfs_flush_pages(ip, 0, -1,
+                                                        (flags & SYNC_WAIT) ? 0
+                                                                : XFS_B_ASYNC,
+                                                        FI_NONE);
+                                }
+                        }
+                        if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
                                xfs_ioend_wait(ip);
                }
                xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -398,15 +404,17 @@ STATIC void
 xfs_syncd_queue_work(
        struct xfs_mount *mp,
        void            *data,
-        void            (*syncer)(struct xfs_mount *, void *))
+        void            (*syncer)(struct xfs_mount *, void *),
+        struct completion *completion)
 {
-        struct bhv_vfs_sync_work *work;
+        struct xfs_sync_work *work;
-        work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+        work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
        INIT_LIST_HEAD(&work->w_list);
        work->w_syncer = syncer;
        work->w_data = data;
        work->w_mount = mp;
+        work->w_completion = completion;
        spin_lock(&mp->m_sync_lock);
        list_add_tail(&work->w_list, &mp->m_sync_list);
        spin_unlock(&mp->m_sync_lock);
@@ -420,49 +428,26 @@ xfs_syncd_queue_work(
 * heads, looking about for more room...
 */
 STATIC void
-xfs_flush_inode_work(
+xfs_flush_inodes_work(
-        struct xfs_mount *mp,
-        void            *arg)
-{
-        struct inode    *inode = arg;
-        filemap_flush(inode->i_mapping);
-        iput(inode);
-}
-void
-xfs_flush_inode(
-        xfs_inode_t     *ip)
-{
-        struct inode    *inode = VFS_I(ip);
-        igrab(inode);
-        xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-        delay(msecs_to_jiffies(500));
-}
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
        struct xfs_mount *mp,
        void            *arg)
 {
        struct inode    *inode = arg;
-        sync_blockdev(mp->m_super->s_bdev);
+        xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);        /* pass 1: delayed writes, trylock only */
+        xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);  /* pass 2: also wait for all I/O */
        iput(inode);
 }
 void
-xfs_flush_device(
+xfs_flush_inodes(
        xfs_inode_t     *ip)
 {
        struct inode    *inode = VFS_I(ip);
+        DECLARE_COMPLETION_ONSTACK(completion);
        igrab(inode);
-        xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
+        xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
-        delay(msecs_to_jiffies(500));
+        wait_for_completion(&completion);       /* xfssyncd completes this via work->w_completion */
        xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
@@ -497,7 +482,7 @@ xfssyncd(
 {
        struct xfs_mount        *mp = arg;
        long                    timeleft;
-        bhv_vfs_sync_work_t     *work, *n;
+        xfs_sync_work_t         *work, *n;
        LIST_HEAD               (tmp);
        set_freezable();
@@ -532,6 +517,8 @@ xfssyncd(
                        list_del(&work->w_list);
                        if (work == &mp->m_sync_work)
                                continue;
+                        if (work->w_completion)
+                                complete(work->w_completion);
                        kmem_free(work);
                }
        }
@@ -545,6 +532,7 @@ xfs_syncd_init(
 {
        mp->m_sync_work.w_syncer = xfs_sync_worker;
        mp->m_sync_work.w_mount = mp;
+        mp->m_sync_work.w_completion = NULL;
        mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
        if (IS_ERR(mp->m_sync_task))
                return -PTR_ERR(mp->m_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 04f058c848ae..308d5bf6dfbd 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -21,18 +21,20 @@
 struct xfs_mount;
 struct xfs_perag;
-typedef struct bhv_vfs_sync_work {
+typedef struct xfs_sync_work {
        struct list_head        w_list;
        struct xfs_mount        *w_mount;
        void                    *w_data;        /* syncer routine argument */
        void                    (*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
+        struct completion       *w_completion;  /* optional; completed by xfssyncd */
+} xfs_sync_work_t;
 #define SYNC_ATTR               0x0001  /* sync attributes */
 #define SYNC_DELWRI             0x0002  /* look at delayed writes */
 #define SYNC_WAIT               0x0004  /* wait for i/o to complete */
 #define SYNC_BDFLUSH            0x0008  /* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT             0x0010  /* wait for all I/O to complete */
+#define SYNC_TRYLOCK            0x0020  /* only try to lock inodes */
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
@@ -43,8 +45,7 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
-void xfs_flush_inode(struct xfs_inode *ip);
+void xfs_flush_inodes(struct xfs_inode *ip);
-void xfs_flush_device(struct xfs_inode *ip);
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 478e587087fe..89b81eedce6a 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -69,15 +69,6 @@ xfs_inode_alloc(
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
-        /*
-         * initialise the VFS inode here to get failures
-         * out of the way early.
-         */
-        if (!inode_init_always(mp->m_super, VFS_I(ip))) {
-                kmem_zone_free(xfs_inode_zone, ip);
-                return NULL;
-        }
        /* initialise the xfs inode */
        ip->i_ino = ino;
        ip->i_mount = mp;
@@ -113,6 +104,20 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
+        /*
+        * Now initialise the VFS inode. We do this after the xfs_inode
+        * initialisation as internal failures will result in ->destroy_inode
+        * being called and that will pass down through the reclaim path and
+        * free the XFS inode. This path requires the XFS inode to already be
+        * initialised. Hence if this call fails, the xfs_inode has already
+        * been freed and we should not reference it at all in the error
+        * handling.
+        */
+        if (!inode_init_always(mp->m_super, VFS_I(ip)))
+                return NULL;
+        /* prevent anyone from using this yet */
+        VFS_I(ip)->i_state = I_NEW|I_LOCK;
        return ip;
 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 08ce72316bfe..5aaa2d7ec155 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb(
 }
 STATIC int
-xfs_flush_space(
-        xfs_inode_t     *ip,
-        int             *fsynced,
-        int             *ioflags)
-{
-        switch (*fsynced) {
-        case 0:
-                if (ip->i_delayed_blks) {
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                        xfs_flush_inode(ip);
-                        xfs_ilock(ip, XFS_ILOCK_EXCL);
-                        *fsynced = 1;
-                } else {
-                        *ioflags |= BMAPI_SYNC;
-                        *fsynced = 2;
-                }
-                return 0;
-        case 1:
-                *fsynced = 2;
-                *ioflags |= BMAPI_SYNC;
-                return 0;
-        case 2:
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                xfs_flush_device(ip);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                *fsynced = 3;
-                return 0;
-        }
-        return 1;
-}
-STATIC int
 xfs_cmn_err_fsblock_zero(
        xfs_inode_t     *ip,
        xfs_bmbt_irec_t *imap)
@@ -538,15 +506,9 @@ error_out:
 }
 /*
- * If the caller is doing a write at the end of the file,
+ * If the caller is doing a write at the end of the file, then extend the
- * then extend the allocation out to the file system's write
+ * allocation out to the file system's write iosize.  We clean up any extra
- * iosize.  We clean up any extra space left over when the
+ * space left over when the file is closed in xfs_inactive().
- * file is closed in xfs_inactive().
- *
- * For sync writes, we are flushing delayed allocate space to
- * try to make additional space available for allocation near
- * the filesystem full boundary - preallocation hurts in that
- * situation, of course.
 */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate(
        int             n, error, imaps;
        *prealloc = 0;
-        if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
+        if ((offset + count) <= ip->i_size)
                return 0;
        /*
@@ -611,7 +573,7 @@ xfs_iomap_write_delay(
        xfs_extlen_t    extsz;
        int             nimaps;
        xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-        int             prealloc, fsynced = 0;
+        int             prealloc, flushed = 0;
        int             error;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -627,12 +589,12 @@ xfs_iomap_write_delay(
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-retry:
        error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
                                ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
                return error;
+retry:
        if (prealloc) {
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -659,15 +621,22 @@ retry:
        /*
         * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-         * then we must have run out of space - flush delalloc, and retry..
+         * then we must have run out of space - flush all other inodes with
+         * delalloc blocks and retry without EOF preallocation.
         */
        if (nimaps == 0) {
                xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
                                        ip, offset, count);
-                if (xfs_flush_space(ip, &fsynced, &ioflag))
+                if (flushed)
                        return XFS_ERROR(ENOSPC);
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                xfs_flush_inodes(ip);
+                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                flushed = 1;
                error = 0;
+                prealloc = 0;
                goto retry;
        }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index a1cc1322fc0f..fdcf7b82747f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -40,8 +40,7 @@ typedef enum {
        BMAPI_IGNSTATE = (1 << 4),      /* ignore unwritten state on read */
        BMAPI_DIRECT = (1 << 5),        /* direct instead of buffered write */
        BMAPI_MMAP = (1 << 6),          /* allocate for mmap write */
-        BMAPI_SYNC = (1 << 7),          /* sync write to flush delalloc space */
+        BMAPI_TRYLOCK = (1 << 7),       /* non-blocking request */
-        BMAPI_TRYLOCK = (1 << 8),       /* non-blocking request */
 } bmapi_flags_t;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f76c6d7cea21..3750f04ede0b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -562,9 +562,8 @@ xfs_log_mount(
        }
        mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
-        if (!mp->m_log) {
+        if (IS_ERR(mp->m_log)) {
-                cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
+                error = -PTR_ERR(mp->m_log);
-                error = ENOMEM;
                goto out;
        }
@@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t	*mp,
        xfs_buf_t               *bp;
        int                     i;
        int                     iclogsize;
+        int                     error = ENOMEM;
        log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
-        if (!log)
+        if (!log) {
-                return NULL;
+                xlog_warn("XFS: Log allocation failed: No memory!");
+                goto out;
+        }
        log->l_mp          = mp;
        log->l_targ        = log_target;
@@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_grant_reserve_cycle = 1;
        log->l_grant_write_cycle = 1;
+        error = EFSCORRUPTED;
        if (xfs_sb_version_hassector(&mp->m_sb)) {
                log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
-                ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
+                if (log->l_sectbb_log < 0 ||
+                    log->l_sectbb_log > mp->m_sectbb_log) {
+                        xlog_warn("XFS: Log sector size (0x%x) out of range.",
+                                                log->l_sectbb_log);
+                        goto out_free_log;
+                }
                /* for larger sector sizes, must have v2 or external log */
-                ASSERT(log->l_sectbb_log == 0 ||
+                if (log->l_sectbb_log != 0 &&
-                        log->l_logBBstart == 0 ||
+                    (log->l_logBBstart != 0 &&
-                        xfs_sb_version_haslogv2(&mp->m_sb));
+                     !xfs_sb_version_haslogv2(&mp->m_sb))) {
-                ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
+                        xlog_warn("XFS: log sector size (0x%x) invalid "
+                                  "for configuration.", log->l_sectbb_log);
+                        goto out_free_log;
+                }
+                if (mp->m_sb.sb_logsectlog < BBSHIFT) {
+                        xlog_warn("XFS: Log sector log (0x%x) too small.",
+                                                mp->m_sb.sb_logsectlog);
+                        goto out_free_log;
+                }
        }
        log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
        xlog_get_iclog_buffer_size(mp, log);
+        error = ENOMEM;
        bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
        if (!bp)
                goto out_free_log;
@@ -1313,7 +1331,8 @@ out_free_iclog:
        xfs_buf_free(log->l_xbuf);
 out_free_log:
        kmem_free(log);
-        return NULL;
+out:
+        return ERR_PTR(-error);
 }       /* xlog_alloc_log */
@@ -2541,18 +2560,19 @@ redo:
                        xlog_ins_ticketq(&log->l_reserve_headq, tic);
                xlog_trace_loggrant(log, tic,
                                    "xlog_grant_log_space: sleep 2");
+                spin_unlock(&log->l_grant_lock);
+                xlog_grant_push_ail(log->l_mp, need_bytes);
+                spin_lock(&log->l_grant_lock);
                XFS_STATS_INC(xs_sleep_logspace);
                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-                if (XLOG_FORCED_SHUTDOWN(log)) {
+                spin_lock(&log->l_grant_lock);
-                        spin_lock(&log->l_grant_lock);
+                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
-                }
                xlog_trace_loggrant(log, tic,
                                    "xlog_grant_log_space: wake 2");
-                xlog_grant_push_ail(log->l_mp, need_bytes);
-                spin_lock(&log->l_grant_lock);
                goto redo;
        } else if (tic->t_flags & XLOG_TIC_IN_Q)
                xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2631,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
         * for more free space, otherwise try to get some space for
         * this transaction.
         */
+        need_bytes = tic->t_unit_res;
        if ((ntic = log->l_write_headq)) {
                free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
                                             log->l_grant_write_bytes);
@@ -2651,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t	   *log,
                        xlog_trace_loggrant(log, tic,
                                    "xlog_regrant_write_log_space: sleep 1");
+                        spin_unlock(&log->l_grant_lock);
+                        xlog_grant_push_ail(log->l_mp, need_bytes);
+                        spin_lock(&log->l_grant_lock);
                        XFS_STATS_INC(xs_sleep_logspace);
                        sv_wait(&tic->t_wait, PINOD|PLTWAIT,
                                &log->l_grant_lock, s);
                        /* If we're shutting down, this tic is already
                         * off the queue */
-                        if (XLOG_FORCED_SHUTDOWN(log)) {
+                        spin_lock(&log->l_grant_lock);
-                                spin_lock(&log->l_grant_lock);
+                        if (XLOG_FORCED_SHUTDOWN(log))
                                goto error_return;
-                        }
                        xlog_trace_loggrant(log, tic,
                                    "xlog_regrant_write_log_space: wake 1");
-                        xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
-                        spin_lock(&log->l_grant_lock);
                }
        }
-        need_bytes = tic->t_unit_res;
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
                goto error_return;
@@ -2680,19 +2699,20 @@ redo:
        if (free_bytes < need_bytes) {
                if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
                        xlog_ins_ticketq(&log->l_write_headq, tic);
+                spin_unlock(&log->l_grant_lock);
+                xlog_grant_push_ail(log->l_mp, need_bytes);
+                spin_lock(&log->l_grant_lock);
                XFS_STATS_INC(xs_sleep_logspace);
                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
                /* If we're shutting down, this tic is already off the queue */
-                if (XLOG_FORCED_SHUTDOWN(log)) {
+                spin_lock(&log->l_grant_lock);
-                        spin_lock(&log->l_grant_lock);
+                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
-                }
                xlog_trace_loggrant(log, tic,
                                    "xlog_regrant_write_log_space: wake 2");
-                xlog_grant_push_ail(log->l_mp, need_bytes);
-                spin_lock(&log->l_grant_lock);
                goto redo;
        } else if (tic->t_flags & XLOG_TIC_IN_Q)
                xlog_del_ticketq(&log->l_write_headq, tic);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7af44adffc8f..d6a64392f983 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -313,7 +313,7 @@ typedef struct xfs_mount {
 #endif
        struct xfs_mru_cache    *m_filestream;  /* per-mount filestream data */
        struct task_struct      *m_sync_task;   /* generalised sync thread */
-        bhv_vfs_sync_work_t     m_sync_work;    /* work item for VFS_SYNC */
+        xfs_sync_work_t         m_sync_work;    /* work item for VFS_SYNC */
        struct list_head        m_sync_list;    /* sync thread work item list */
        spinlock_t              m_sync_lock;    /* work item list lock */
        int                     m_sync_seq;     /* sync thread generation no. */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7394c7af5de5..19cf90a9c762 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1457,6 +1457,13 @@ xfs_create(
        error = xfs_trans_reserve(tp, resblks, log_res, 0,
                        XFS_TRANS_PERM_LOG_RES, log_count);
        if (error == ENOSPC) {
+                /* flush outstanding delalloc blocks and retry the same request */
+                xfs_flush_inodes(dp);
+                error = xfs_trans_reserve(tp, resblks, log_res, 0,
+                                XFS_TRANS_PERM_LOG_RES, log_count);
+        }
+        if (error == ENOSPC) {
+                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, log_res, 0,
                                XFS_TRANS_PERM_LOG_RES, log_count);