author    Ingo Molnar <mingo@elte.hu>  2009-09-19 05:27:32 -0400
committer Ingo Molnar <mingo@elte.hu>  2009-09-19 05:28:41 -0400
commit    929bf0d0156562ce631728b6fa53d68004d456d2 (patch)
tree      739063990a8077b29ef97e69d73bce94573daae4 /fs
parent    def0a9b2573e00ab0b486cb5382625203ab4c4a6 (diff)
parent    202c4675c55ddf6b443c7e057d2dff6b42ef71aa (diff)
Merge branch 'linus' into perfcounters/core
Merge reason: Bring in tracing changes we depend on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/afs/write.c | 1
-rw-r--r--  fs/block_dev.c | 30
-rw-r--r--  fs/btrfs/disk-io.c | 1
-rw-r--r--  fs/btrfs/extent-tree.c | 3
-rw-r--r--  fs/btrfs/ordered-data.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 4
-rw-r--r--  fs/cifs/CHANGES | 5
-rw-r--r--  fs/cifs/cifs_spnego.c | 2
-rw-r--r--  fs/cifs/cifsacl.c | 4
-rw-r--r--  fs/cifs/cifsencrypt.c | 1
-rw-r--r--  fs/cifs/cifsfs.c | 22
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 21
-rw-r--r--  fs/cifs/cifssmb.c | 316
-rw-r--r--  fs/cifs/connect.c | 49
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 43
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/transport.c | 17
-rw-r--r--  fs/dlm/netlink.c | 2
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext3/file.c | 61
-rw-r--r--  fs/ext4/file.c | 53
-rw-r--r--  fs/fat/file.c | 22
-rw-r--r--  fs/fat/misc.c | 4
-rw-r--r--  fs/fs-writeback.c | 399
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/gfs2/Makefile | 2
-rw-r--r--  fs/gfs2/acl.c | 106
-rw-r--r--  fs/gfs2/dentry.c | 18
-rw-r--r--  fs/gfs2/eaops.c | 157
-rw-r--r--  fs/gfs2/eaops.h | 30
-rw-r--r--  fs/gfs2/export.c | 36
-rw-r--r--  fs/gfs2/file.c | 1
-rw-r--r--  fs/gfs2/incore.h | 15
-rw-r--r--  fs/gfs2/inode.c | 159
-rw-r--r--  fs/gfs2/ops_fstype.c | 66
-rw-r--r--  fs/gfs2/ops_inode.c | 82
-rw-r--r--  fs/gfs2/rgrp.c | 88
-rw-r--r--  fs/gfs2/rgrp.h | 6
-rw-r--r--  fs/gfs2/super.c | 46
-rw-r--r--  fs/gfs2/super.h | 5
-rw-r--r--  fs/gfs2/sys.c | 31
-rw-r--r--  fs/gfs2/util.c | 41
-rw-r--r--  fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c) | 425
-rw-r--r--  fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h) | 54
-rw-r--r--  fs/inode.c | 4
-rw-r--r--  fs/jbd2/commit.c | 1
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfs/write.c | 1
-rw-r--r--  fs/nilfs2/Kconfig | 2
-rw-r--r--  fs/nilfs2/bmap.c | 151
-rw-r--r--  fs/nilfs2/bmap.h | 76
-rw-r--r--  fs/nilfs2/btree.c | 625
-rw-r--r--  fs/nilfs2/cpfile.c | 11
-rw-r--r--  fs/nilfs2/cpfile.h | 2
-rw-r--r--  fs/nilfs2/dat.c | 42
-rw-r--r--  fs/nilfs2/dat.h | 8
-rw-r--r--  fs/nilfs2/direct.c | 161
-rw-r--r--  fs/nilfs2/ifile.h | 1
-rw-r--r--  fs/nilfs2/inode.c | 3
-rw-r--r--  fs/nilfs2/ioctl.c | 26
-rw-r--r--  fs/nilfs2/mdt.c | 40
-rw-r--r--  fs/nilfs2/mdt.h | 3
-rw-r--r--  fs/nilfs2/recovery.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 7
-rw-r--r--  fs/nilfs2/sufile.h | 1
-rw-r--r--  fs/nilfs2/super.c | 100
-rw-r--r--  fs/nilfs2/the_nilfs.c | 19
-rw-r--r--  fs/nilfs2/the_nilfs.h | 43
-rw-r--r--  fs/ntfs/file.c | 16
-rw-r--r--  fs/ntfs/mft.c | 13
-rw-r--r--  fs/ocfs2/file.c | 49
-rw-r--r--  fs/partitions/check.c | 14
-rw-r--r--  fs/splice.c | 30
-rw-r--r--  fs/super.c | 6
-rw-r--r--  fs/sync.c | 65
-rw-r--r--  fs/ubifs/budget.c | 20
-rw-r--r--  fs/ubifs/super.c | 1
-rw-r--r--  fs/udf/directory.c | 86
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/inode.c | 19
-rw-r--r--  fs/udf/lowlevel.c | 4
-rw-r--r--  fs/udf/namei.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.c | 51
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 24
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 15
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 78
-rw-r--r--  fs/xfs/xfs_ag.h | 9
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.h | 11
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 20
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 1
-rw-r--r--  fs/xfs/xfs_btree.c | 42
-rw-r--r--  fs/xfs/xfs_btree.h | 15
-rw-r--r--  fs/xfs/xfs_ialloc.c | 805
-rw-r--r--  fs/xfs/xfs_ialloc.h | 18
-rw-r--r--  fs/xfs/xfs_iget.c | 27
-rw-r--r--  fs/xfs/xfs_inode.c | 8
-rw-r--r--  fs/xfs/xfs_inode.h | 8
-rw-r--r--  fs/xfs/xfs_inode_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_inum.h | 1
-rw-r--r--  fs/xfs/xfs_itable.c | 98
-rw-r--r--  fs/xfs/xfs_itable.h | 5
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 2
-rw-r--r--  fs/xfs/xfs_mount.h | 3
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 29
-rw-r--r--  fs/xfs/xfs_mru_cache.h | 1
-rw-r--r--  fs/xfs/xfs_rw.c | 84
-rw-r--r--  fs/xfs/xfs_rw.h | 7
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 4
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 86
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 17
124 files changed, 2295 insertions(+), 3330 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..455aa207e67e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi = mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
-		.for_writepages = 1,
 		.range_cyclic = 1,
 	};
 	int ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
 {
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	bdi->bdev.bd_inode_backing_dev_info = NULL;
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1405,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 }
 
 /*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
+/*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
  */
@@ -1436,7 +1462,7 @@ const struct file_operations def_blk_fops = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= generic_file_aio_write_nolock,
+	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
 	.unlocked_ioctl	= block_ioctl,
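
The blkdev_aio_write() hunk above shows the pattern this merge applies across filesystems: perform the buffered write first, then do any O_SYNC flushing as a separate step (generic_write_sync() here), letting a flush failure override an otherwise successful write's return value. A minimal userspace sketch of that write-then-sync split follows; write_then_sync() is an illustrative name, not a kernel or libc API.

/*
 * Userspace sketch of the write-then-sync split used by
 * blkdev_aio_write(): write the data, and only if O_SYNC semantics
 * apply, flush afterwards; a flush error overrides a successful write.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static ssize_t write_then_sync(int fd, const void *buf, size_t len, int o_sync)
{
	ssize_t ret = write(fd, buf, len);

	if (ret > 0 && o_sync) {
		/* mirrors: err = generic_write_sync(); if (err < 0) ret = err; */
		if (fdatasync(fd) < 0)
			ret = -errno;
	}
	return ret;
}

int main(void)
{
	int fd = open("demo.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	printf("wrote %zd bytes\n", write_then_sync(fd, "data", 4, 1));
	close(fd);
	return 0;
}
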
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5c7367..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+			DISCARD_FL_BARRIER);
 }
 #endif
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = start,
 		.range_end = end,
-		.for_writepages = 1,
 	};
 	return btrfs_writepages(mapping, &wbc);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
 		num_run++;
 		batch_run++;
 
-		if (bio_sync(cur))
+		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
 		if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.60
 Fix memory leak in reconnect. Fix oops in DFS mount error path.
 Set s_maxbytes to smaller (the max that vfs can handle) so that
 sendfile will now work over cifs mounts again. Add noforcegid
-and noforceuid mount parameters.
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
 
 Version 1.59
 ------------
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	if (server->addr.sockAddr.sin_family == AF_INET)
 		sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
 	else if (server->addr.sockAddr.sin_family == AF_INET6)
-		sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
+		sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
 	else
 		goto out;
 
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
 	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
-	atomic_dec(&open_file->wrtPending);
+	cifsFileInfo_put(open_file);
 	return pntsd;
 }
 
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 
 	rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
-	atomic_dec(&open_file->wrtPending);
+	cifsFileInfo_put(open_file);
 	return rc;
 }
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
 	   compare with the NTLM example */
 	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
 
+	kfree(pctxt);
 	return rc;
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..3610e9958b4c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *tcon;
-
-	cifs_sb = CIFS_SB(m->mnt_sb);
-	tcon = cifs_sb->tcon;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
 
-	seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+	seq_printf(s, ",unc=%s", tcon->treeName);
 	if (tcon->ses->userName)
 		seq_printf(s, ",username=%s", tcon->ses->userName);
 	if (tcon->ses->domainName)
@@ -989,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
 		if (try_to_freeze())
 			continue;
 
-		spin_lock(&GlobalMid_Lock);
-		if (list_empty(&GlobalOplock_Q)) {
-			spin_unlock(&GlobalMid_Lock);
+		spin_lock(&cifs_oplock_lock);
+		if (list_empty(&cifs_oplock_list)) {
+			spin_unlock(&cifs_oplock_lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(39*HZ);
 		} else {
-			oplock_item = list_entry(GlobalOplock_Q.next,
+			oplock_item = list_entry(cifs_oplock_list.next,
 				struct oplock_q_entry, qhead);
 			cFYI(1, ("found oplock item to write out"));
 			pTcon = oplock_item->tcon;
 			inode = oplock_item->pinode;
 			netfid = oplock_item->netfid;
-			spin_unlock(&GlobalMid_Lock);
+			spin_unlock(&cifs_oplock_lock);
 			DeleteOplockQEntry(oplock_item);
 			/* can not grab inode sem here since it would
 				deadlock when oplock received on delete
@@ -1058,7 +1055,7 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalOplock_Q);
+	INIT_LIST_HEAD(&cifs_oplock_list);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
 	INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1087,6 +1084,7 @@ init_cifs(void)
 	rwlock_init(&GlobalSMBSeslock);
 	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
+	spin_lock_init(&cifs_oplock_lock);
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..094325e3f714 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.60"
+#define CIFS_VERSION   "1.61"
 #endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..6cfc81a32703 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
 	bool closePend:1;	/* file is marked to close */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool messageMode:1;	/* for pipes: message vs byte mode */
-	atomic_t wrtPending;	/* handle in use - defer close */
+	atomic_t count;		/* reference count */
 	struct mutex fh_mutex;	/* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
 };
 
+/* Take a reference on the file private data */
+static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+	atomic_inc(&cifs_file->count);
+}
+
+/* Release a reference on the file private data */
+static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	if (atomic_dec_and_test(&cifs_file->count))
+		kfree(cifs_file);
+}
+
 /*
  * One of these for each file inode
  */
@@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
  */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
-GLOBAL_EXTERN struct list_head GlobalOplock_Q;
+/* Global list of oplocks */
+GLOBAL_EXTERN struct list_head cifs_oplock_list;
+
+/* Protects the cifs_oplock_list */
+GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
 
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
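
The cifsFileInfo_get()/cifsFileInfo_put() helpers added above replace the ad-hoc wrtPending counter with a plain reference count: the handle starts with one reference for the opener, each user of the handle takes and drops a reference, and the last put frees the structure. A standalone sketch of the same idiom, using C11 atomics and illustrative names rather than the CIFS types:

/* Standalone sketch of the get/put refcount idiom: the object starts
 * at one reference and is freed when the last reference is dropped. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct file_info {
	atomic_int count;	/* reference count, starts at 1 for the opener */
	int netfid;
};

static struct file_info *file_info_alloc(int netfid)
{
	struct file_info *f = malloc(sizeof(*f));

	if (f) {
		atomic_init(&f->count, 1);
		f->netfid = netfid;
	}
	return f;
}

static void file_info_get(struct file_info *f)
{
	atomic_fetch_add(&f->count, 1);
}

static void file_info_put(struct file_info *f)
{
	/* fetch_sub returns the old value; 1 means we dropped the last ref */
	if (atomic_fetch_sub(&f->count, 1) == 1)
		free(f);
}

int main(void)
{
	struct file_info *f = file_info_alloc(42);

	file_info_get(f);	/* e.g. a writer pins the handle */
	file_info_put(f);	/* writer done */
	file_info_put(f);	/* opener's reference; frees here */
	return 0;
}
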
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..301e307e1279 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 			to this tcon */
 }
 
-/* Allocate and return pointer to an SMB request buffer, and set basic
-   SMB information in the SMB header. If the return code is zero, this
-   function must have filled in request_buf pointer */
-static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-		void **request_buf)
-{
-	int rc = 0;
-
-	/* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
-	   check for tcp and smb session status done differently
-	   for those three - in the calling routine */
-	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
-			/* only tree disconnect, open, and write,
-			  (and ulogoff which does not have tcon)
-			  are allowed as we start force umount */
-			if ((smb_command != SMB_COM_WRITE_ANDX) &&
-			    (smb_command != SMB_COM_OPEN_ANDX) &&
-			    (smb_command != SMB_COM_TREE_DISCONNECT)) {
-				cFYI(1, ("can not send cmd %d while umounting",
-					smb_command));
-				return -ENODEV;
-			}
-		}
-		if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
-		    (tcon->ses->server)) {
-			struct nls_table *nls_codepage;
-			/* Give Demultiplex thread up to 10 seconds to
-			   reconnect, should be greater than cifs socket
-			   timeout which is 7 seconds */
-			while (tcon->ses->server->tcpStatus ==
-							CifsNeedReconnect) {
-				wait_event_interruptible_timeout(tcon->ses->server->response_q,
-					(tcon->ses->server->tcpStatus ==
-						CifsGood), 10 * HZ);
-				if (tcon->ses->server->tcpStatus ==
-						CifsNeedReconnect) {
-					/* on "soft" mounts we wait once */
-					if (!tcon->retry ||
-					   (tcon->ses->status == CifsExiting)) {
-						cFYI(1, ("gave up waiting on "
-							"reconnect in smb_init"));
-						return -EHOSTDOWN;
-					} /* else "hard" mount - keep retrying
-					     until process is killed or server
-					     comes back on-line */
-				} else /* TCP session is reestablished now */
-					break;
-			}
-
-			nls_codepage = load_nls_default();
-		/* need to prevent multiple threads trying to
-		   simultaneously reconnect the same SMB session */
-			down(&tcon->ses->sesSem);
-			if (tcon->ses->need_reconnect)
-				rc = cifs_setup_session(0, tcon->ses,
-							nls_codepage);
-			if (!rc && (tcon->need_reconnect)) {
-				mark_open_files_invalid(tcon);
-				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
-					      tcon, nls_codepage);
-				up(&tcon->ses->sesSem);
-				/* BB FIXME add code to check if wsize needs
-				   update due to negotiated smb buffer size
-				   shrinking */
-				if (rc == 0) {
-					atomic_inc(&tconInfoReconnectCount);
-					/* tell server Unix caps we support */
-					if (tcon->ses->capabilities & CAP_UNIX)
-						reset_cifs_unix_caps(
-						0 /* no xid */,
-						tcon,
-						NULL /* we do not know sb */,
-						NULL /* no vol info */);
-				}
-
-				cFYI(1, ("reconnect tcon rc = %d", rc));
-				/* Removed call to reopen open files here.
-				   It is safer (and faster) to reopen files
-				   one at a time as needed in read and write */
-
-				/* Check if handle based operation so we
-				   know whether we can continue or not without
-				   returning to caller to reset file handle */
-				switch (smb_command) {
-				case SMB_COM_READ_ANDX:
-				case SMB_COM_WRITE_ANDX:
-				case SMB_COM_CLOSE:
-				case SMB_COM_FIND_CLOSE2:
-				case SMB_COM_LOCKING_ANDX: {
-					unload_nls(nls_codepage);
-					return -EAGAIN;
-				}
-				}
-			} else {
-				up(&tcon->ses->sesSem);
-			}
-			unload_nls(nls_codepage);
-
-		} else {
-			return -EIO;
-		}
-	}
+/* reconnect the socket, tcon, and smb session if needed */
+static int
+cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
+{
+	int rc = 0;
+	struct cifsSesInfo *ses;
+	struct TCP_Server_Info *server;
+	struct nls_table *nls_codepage;
+
+	/*
+	 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+	 * tcp and smb session status done differently for those three - in the
+	 * calling routine
+	 */
+	if (!tcon)
+		return 0;
+
+	ses = tcon->ses;
+	server = ses->server;
+
+	/*
+	 * only tree disconnect, open, and write, (and ulogoff which does not
+	 * have tcon) are allowed as we start force umount
+	 */
+	if (tcon->tidStatus == CifsExiting) {
+		if (smb_command != SMB_COM_WRITE_ANDX &&
+		    smb_command != SMB_COM_OPEN_ANDX &&
+		    smb_command != SMB_COM_TREE_DISCONNECT) {
+			cFYI(1, ("can not send cmd %d while umounting",
+				smb_command));
+			return -ENODEV;
+		}
+	}
+
+	if (ses->status == CifsExiting)
+		return -EIO;
+
+	/*
+	 * Give demultiplex thread up to 10 seconds to reconnect, should be
+	 * greater than cifs socket timeout which is 7 seconds
+	 */
+	while (server->tcpStatus == CifsNeedReconnect) {
+		wait_event_interruptible_timeout(server->response_q,
+			(server->tcpStatus == CifsGood), 10 * HZ);
+
+		/* is TCP session is reestablished now ?*/
+		if (server->tcpStatus != CifsNeedReconnect)
+			break;
+
+		/*
+		 * on "soft" mounts we wait once. Hard mounts keep
+		 * retrying until process is killed or server comes
+		 * back on-line
+		 */
+		if (!tcon->retry || ses->status == CifsExiting) {
+			cFYI(1, ("gave up waiting on reconnect in smb_init"));
+			return -EHOSTDOWN;
+		}
+	}
+
+	if (!ses->need_reconnect && !tcon->need_reconnect)
+		return 0;
+
+	nls_codepage = load_nls_default();
+
+	/*
+	 * need to prevent multiple threads trying to simultaneously
+	 * reconnect the same SMB session
+	 */
+	down(&ses->sesSem);
+	if (ses->need_reconnect)
+		rc = cifs_setup_session(0, ses, nls_codepage);
+
+	/* do we need to reconnect tcon? */
+	if (rc || !tcon->need_reconnect) {
+		up(&ses->sesSem);
+		goto out;
+	}
+
+	mark_open_files_invalid(tcon);
+	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+	up(&ses->sesSem);
+	cFYI(1, ("reconnect tcon rc = %d", rc));
+
+	if (rc)
+		goto out;
+
+	/*
+	 * FIXME: check if wsize needs updated due to negotiated smb buffer
+	 * size shrinking
+	 */
+	atomic_inc(&tconInfoReconnectCount);
+
+	/* tell server Unix caps we support */
+	if (ses->capabilities & CAP_UNIX)
+		reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+	/*
+	 * Removed call to reopen open files here. It is safer (and faster) to
+	 * reopen files one at a time as needed in read and write.
+	 *
+	 * FIXME: what about file locks? don't we need to reclaim them ASAP?
+	 */
+
+out:
+	/*
+	 * Check if handle based operation so we know whether we can continue
+	 * or not without returning to caller to reset file handle
+	 */
+	switch (smb_command) {
+	case SMB_COM_READ_ANDX:
+	case SMB_COM_WRITE_ANDX:
+	case SMB_COM_CLOSE:
+	case SMB_COM_FIND_CLOSE2:
+	case SMB_COM_LOCKING_ANDX:
+		rc = -EAGAIN;
+	}
+
+	unload_nls(nls_codepage);
+	return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+   SMB information in the SMB header. If the return code is zero, this
+   function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+		void **request_buf)
+{
+	int rc = 0;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
 		return rc;
 
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 {
 	int rc = 0;
 
-	/* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
-	   check for tcp and smb session status done differently
-	   for those three - in the calling routine */
-	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
-			/* only tree disconnect, open, and write,
-			  (and ulogoff which does not have tcon)
-			  are allowed as we start force umount */
-			if ((smb_command != SMB_COM_WRITE_ANDX) &&
-			    (smb_command != SMB_COM_OPEN_ANDX) &&
-			    (smb_command != SMB_COM_TREE_DISCONNECT)) {
-				cFYI(1, ("can not send cmd %d while umounting",
-					smb_command));
-				return -ENODEV;
-			}
-		}
-
-		if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
-		    (tcon->ses->server)) {
-			struct nls_table *nls_codepage;
-			/* Give Demultiplex thread up to 10 seconds to
-			   reconnect, should be greater than cifs socket
-			   timeout which is 7 seconds */
-			while (tcon->ses->server->tcpStatus ==
-						CifsNeedReconnect) {
-				wait_event_interruptible_timeout(tcon->ses->server->response_q,
-					(tcon->ses->server->tcpStatus ==
-						CifsGood), 10 * HZ);
-				if (tcon->ses->server->tcpStatus ==
-						CifsNeedReconnect) {
-					/* on "soft" mounts we wait once */
-					if (!tcon->retry ||
-					   (tcon->ses->status == CifsExiting)) {
-						cFYI(1, ("gave up waiting on "
-							"reconnect in smb_init"));
-						return -EHOSTDOWN;
-					} /* else "hard" mount - keep retrying
-					     until process is killed or server
-					     comes on-line */
-				} else /* TCP session is reestablished now */
-					break;
-			}
-			nls_codepage = load_nls_default();
-		/* need to prevent multiple threads trying to
-		   simultaneously reconnect the same SMB session */
-			down(&tcon->ses->sesSem);
-			if (tcon->ses->need_reconnect)
-				rc = cifs_setup_session(0, tcon->ses,
-							nls_codepage);
-			if (!rc && (tcon->need_reconnect)) {
-				mark_open_files_invalid(tcon);
-				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
-					      tcon, nls_codepage);
-				up(&tcon->ses->sesSem);
-				/* BB FIXME add code to check if wsize needs
-				   update due to negotiated smb buffer size
-				   shrinking */
-				if (rc == 0) {
-					atomic_inc(&tconInfoReconnectCount);
-					/* tell server Unix caps we support */
-					if (tcon->ses->capabilities & CAP_UNIX)
-						reset_cifs_unix_caps(
-						0 /* no xid */,
-						tcon,
-						NULL /* do not know sb */,
-						NULL /* no vol info */);
-				}
-
-				cFYI(1, ("reconnect tcon rc = %d", rc));
-				/* Removed call to reopen open files here.
-				   It is safer (and faster) to reopen files
-				   one at a time as needed in read and write */
-
-				/* Check if handle based operation so we
-				   know whether we can continue or not without
-				   returning to caller to reset file handle */
-				switch (smb_command) {
-				case SMB_COM_READ_ANDX:
-				case SMB_COM_WRITE_ANDX:
-				case SMB_COM_CLOSE:
-				case SMB_COM_FIND_CLOSE2:
-				case SMB_COM_LOCKING_ANDX: {
-					unload_nls(nls_codepage);
-					return -EAGAIN;
-				}
-				}
-			} else {
-				up(&tcon->ses->sesSem);
-			}
-			unload_nls(nls_codepage);
-
-		} else {
-			return -EIO;
-		}
-	}
+	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
 		return rc;
 
@@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	if (is_unicode) {
 		__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
 				GFP_KERNEL);
+		if (tmp == NULL) {
+			rc = -ENOMEM;
+			goto parse_DFS_referrals_exit;
+		}
 		cifsConvertToUCS((__le16 *) tmp, searchName,
 				PATH_MAX, nls_codepage, remap);
 		node->path_consumed = cifs_ucs2_bytes(tmp,
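
cifs_reconnect_tcon(), extracted above, centralizes the wait-for-reconnect loop that small_smb_init() and smb_init() previously duplicated: wait in bounded steps while the demultiplex thread reconnects, give up after one wait on "soft" mounts, and keep retrying on "hard" mounts. A compact, runnable sketch of just that control flow, with stand-in names (wait_for_status() plays the role of wait_event_interruptible_timeout(); none of these are kernel APIs):

/* Control-flow sketch of the reconnect wait loop factored out above. */
#include <stdbool.h>
#include <stdio.h>

enum status { GOOD, NEED_RECONNECT };

static enum status server_status = NEED_RECONNECT;
static int waits;

static void wait_for_status(void)
{
	/* pretend the demultiplex thread reconnects after two waits */
	if (++waits >= 2)
		server_status = GOOD;
}

static int reconnect(bool hard_mount)
{
	while (server_status == NEED_RECONNECT) {
		wait_for_status();	/* up to 10s per wait in the kernel */
		if (server_status != NEED_RECONNECT)
			break;
		if (!hard_mount)	/* "soft" mounts wait only once */
			return -1;	/* -EHOSTDOWN in the kernel */
	}
	return 0;			/* session usable again */
}

int main(void)
{
	printf("reconnect: %d after %d waits\n", reconnect(true), waits);
	return 0;
}
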
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..d49682433c20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 {
 	struct list_head *tmp;
 	struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (addr->ss_family == AF_INET &&
-		    (addr4->sin_addr.s_addr !=
-		     server->addr.sockAddr.sin_addr.s_addr))
-			continue;
-		else if (addr->ss_family == AF_INET6 &&
-			 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-					   &addr6->sin6_addr) ||
-			  server->addr.sockAddr6.sin6_scope_id !=
-			  addr6->sin6_scope_id))
-			continue;
+		switch (addr->ss_family) {
+		case AF_INET:
+			if (addr4->sin_addr.s_addr ==
+			    server->addr.sockAddr.sin_addr.s_addr) {
+				addr4->sin_port = htons(port);
+				/* user overrode default port? */
+				if (addr4->sin_port) {
+					if (addr4->sin_port !=
+					    server->addr.sockAddr.sin_port)
+						continue;
+				}
+				break;
+			} else
+				continue;
+
+		case AF_INET6:
+			if (ipv6_addr_equal(&addr6->sin6_addr,
+			    &server->addr.sockAddr6.sin6_addr) &&
+			    (addr6->sin6_scope_id ==
+			    server->addr.sockAddr6.sin6_scope_id)) {
+				addr6->sin6_port = htons(port);
+				/* user overrode default port? */
+				if (addr6->sin6_port) {
+					if (addr6->sin6_port !=
+					    server->addr.sockAddr6.sin6_port)
+						continue;
+				}
+				break;
+			} else
+				continue;
+		}
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 
 	/* see if we already have a matching tcp_ses */
-	tcp_ses = cifs_find_tcp_session(&addr);
+	tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
 	if (tcp_ses)
 		return tcp_ses;
 
@@ -2636,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		return -EIO;
 
 	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
+	if (smb_buffer == NULL)
 		return -ENOMEM;
-	}
+
 	smb_buffer_response = smb_buffer;
 
 	header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
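
Per the CHANGES entry earlier, cifs_find_tcp_session() now also matches on the port, so a second mount to the same address with an explicitly different port gets its own TCP session, while mounts that leave the port at its default (zero here) still reuse an existing session. The matching rule reduces to the sketch below (IPv4 only, simplified types, illustrative names, not the kernel code):

/* Sketch of the session-matching rule introduced above: reuse an
 * existing TCP session only if the address matches and, when the user
 * gave an explicit (non-zero) port, the port matches too. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct session { uint32_t addr; uint16_t port; };

static bool session_matches(const struct session *existing,
			    uint32_t want_addr, uint16_t want_port)
{
	if (existing->addr != want_addr)
		return false;
	/* want_port == 0 means "default": any port on this address is fine */
	if (want_port && want_port != existing->port)
		return false;
	return true;
}

int main(void)
{
	struct session s = { .addr = 0x0a000001, .port = 445 };

	printf("%d\n", session_matches(&s, 0x0a000001, 0));	/* 1: reuse */
	printf("%d\n", session_matches(&s, 0x0a000001, 139));	/* 0: new session */
	return 0;
}
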
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..a6424cfc0121 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
-	atomic_set(&pCifsFile->wrtPending, 0);
+	atomic_set(&pCifsFile->count, 1);
 
 	/* set the following in open now
 			pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..fa7beac8b80e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
 	private_data->pInode = inode;
 	private_data->invalidHandle = false;
 	private_data->closePend = false;
-	/* we have to track num writers to the inode, since writepages
-	   does not tell us which handle the write is for so there can
-	   be a close (overlapping with write) of the filehandle that
-	   cifs_writepages chose to use */
-	atomic_set(&private_data->wrtPending, 0);
+	/* Initialize reference count to one. The private data is
+	freed on the release of the last reference */
+	atomic_set(&private_data->count, 1);
 
 	return private_data;
 }
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		if (!pTcon->need_reconnect) {
 			write_unlock(&GlobalSMBSeslock);
 			timeout = 2;
-			while ((atomic_read(&pSMBFile->wrtPending) != 0)
+			while ((atomic_read(&pSMBFile->count) != 1)
 				&& (timeout <= 2048)) {
 				/* Give write a better chance to get to
 				server ahead of the close. We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
 				msleep(timeout);
 				timeout *= 4;
 			}
-			if (atomic_read(&pSMBFile->wrtPending))
-				cERROR(1, ("close with pending write"));
 			if (!pTcon->need_reconnect &&
 			    !pSMBFile->invalidHandle)
 				rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		list_del(&pSMBFile->flist);
 		list_del(&pSMBFile->tlist);
 		write_unlock(&GlobalSMBSeslock);
-		timeout = 10;
-		/* We waited above to give the SMBWrite a chance to issue
-		   on the wire (so we do not get SMBWrite returning EBADF
-		   if writepages is racing with close. Note that writepages
-		   does not specify a file handle, so it is possible for a file
-		   to be opened twice, and the application close the "wrong"
-		   file handle - in these cases we delay long enough to allow
-		   the SMBWrite to get on the wire before the SMB Close.
-		   We allow total wait here over 45 seconds, more than
-		   oplock break time, and more than enough to allow any write
-		   to complete on the server, or to time out on the client */
-		while ((atomic_read(&pSMBFile->wrtPending) != 0)
-				&& (timeout <= 50000)) {
-			cERROR(1, ("writes pending, delay free of handle"));
-			msleep(timeout);
-			timeout *= 8;
-		}
-		kfree(file->private_data);
+		cifsFileInfo_put(file->private_data);
 		file->private_data = NULL;
 	} else
 		rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 			if (!open_file->invalidHandle) {
 				/* found a good file */
 				/* lock it so it will not be closed on us */
-				atomic_inc(&open_file->wrtPending);
+				cifsFileInfo_get(open_file);
 				read_unlock(&GlobalSMBSeslock);
 				return open_file;
 			} /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
 		if (open_file->pfile &&
 		    ((open_file->pfile->f_flags & O_RDWR) ||
 		     (open_file->pfile->f_flags & O_WRONLY))) {
-			atomic_inc(&open_file->wrtPending);
+			cifsFileInfo_get(open_file);
 
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
 			else { /* start over in case this was deleted */
 				/* since the list could be modified */
 				read_lock(&GlobalSMBSeslock);
-				atomic_dec(&open_file->wrtPending);
+				cifsFileInfo_put(open_file);
 				goto refind_writable;
 			}
 		}
@@ -1309,7 +1288,7 @@ refind_writable:
 			read_lock(&GlobalSMBSeslock);
 			/* can not use this handle, no write
 			   pending on this one after all */
-			atomic_dec(&open_file->wrtPending);
+			cifsFileInfo_put(open_file);
 
 			if (open_file->closePend) /* list could have changed */
 				goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	if (open_file) {
 		bytes_written = cifs_write(open_file->pfile, write_data,
 					   to-from, &offset);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 		/* Does mm or vfs already set times? */
 		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
 		if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
 						   long_op);
-			atomic_dec(&open_file->wrtPending);
+			cifsFileInfo_put(open_file);
 			cifs_update_eof(cifsi, offset, bytes_written);
 
 			if (rc || bytes_written < bytes_to_write) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..1f09c7619319 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -800,7 +800,7 @@ set_via_filehandle:
 	if (open_file == NULL)
 		CIFSSMBClose(xid, pTcon, netfid);
 	else
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 out:
 	return rc;
 }
@@ -1635,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		__u32 npid = open_file->pid;
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 		cFYI(1, ("SetFSize for attrs rc = %d", rc));
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			unsigned int bytes_written;
@@ -1790,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 	} else {
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 					    cifs_sb->local_nls,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..1da4ab250eae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
 		temp->pinode = pinode;
 		temp->tcon = tcon;
 		temp->netfid = fid;
-		spin_lock(&GlobalMid_Lock);
-		list_add_tail(&temp->qhead, &GlobalOplock_Q);
-		spin_unlock(&GlobalMid_Lock);
+		spin_lock(&cifs_oplock_lock);
+		list_add_tail(&temp->qhead, &cifs_oplock_list);
+		spin_unlock(&cifs_oplock_lock);
 	}
 	return temp;
-
 }
 
 void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
 {
-	spin_lock(&GlobalMid_Lock);
+	spin_lock(&cifs_oplock_lock);
 	/* should we check if list empty first? */
 	list_del(&oplockEntry->qhead);
-	spin_unlock(&GlobalMid_Lock);
+	spin_unlock(&cifs_oplock_lock);
 	kmem_cache_free(cifs_oplock_cachep, oplockEntry);
 }
 
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 	if (tcon == NULL)
 		return;
 
-	spin_lock(&GlobalMid_Lock);
-	list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
+	spin_lock(&cifs_oplock_lock);
+	list_for_each_entry(temp, &cifs_oplock_list, qhead) {
 		if ((temp->tcon) && (temp->tcon == tcon)) {
 			list_del(&temp->qhead);
 			kmem_cache_free(cifs_oplock_cachep, temp);
 		}
 	}
-	spin_unlock(&GlobalMid_Lock);
+	spin_unlock(&cifs_oplock_lock);
 }
 
 static int
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
 		return rv;
 	}
 
-	return genlmsg_unicast(skb, listener_nlpid);
+	return genlmsg_unicast(&init_net, skb, listener_nlpid);
 }
 
 static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..1c1638f873a4 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
 			unlock_buffer(bh);
 			mark_buffer_dirty_inode(bh, inode);
 			/* We used to sync bh here if IS_SYNC(inode).
-			 * But we now rely upon generic_osync_inode()
+			 * But we now rely upon generic_write_sync()
 			 * and b_inode_buffers. But not for directories.
 			 */
 			if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 299253214789..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush. If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too? If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 27f3c5354c0e..5ca3eca70a1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext4_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush. If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too? If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext4_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
+	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
 static struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	if (IS_SYNC(inode))
-		err = sync_page_range_nolock(inode, mapping, start, count);
+	if (IS_SYNC(inode)) {
+		int err2;
+
+		/*
+		 * Opencode syncing since we don't have a file open to use
+		 * standard fsync path.
+		 */
+		err = filemap_fdatawrite_range(mapping, start,
+					       start + count - 1);
+		err2 = sync_mapping_buffers(mapping);
+		if (!err)
+			err = err2;
+		err2 = write_inode_now(inode, 1);
+		if (!err)
+			err = err2;
+		if (!err) {
+			err = filemap_fdatawait_range(mapping, start,
+						      start + count - 1);
+		}
+	}
 out:
 	return err;
 }
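
The fat_cont_expand() hunk above open-codes what a file-based fsync would do, since there is no struct file at this point: write out the range, sync the mapping's buffers, write the inode, then wait on the range, keeping the first error encountered. A tiny sketch of that "first error wins" aggregation, with stand-in step functions rather than the real filemap calls:

/* Sketch of the error aggregation used above when composing sync
 * steps by hand: later steps run, but the first failure is returned. */
#include <stdio.h>

static int step_ok(void)   { return 0; }
static int step_fail(void) { return -5; /* stands in for -EIO */ }

int main(void)
{
	int err, err2;

	err = step_ok();		/* filemap_fdatawrite_range() */
	err2 = step_fail();		/* sync_mapping_buffers() */
	if (!err)
		err = err2;
	err2 = step_ok();		/* write_inode_now() */
	if (!err)
		err = err2;
	if (!err)
		err = step_ok();	/* filemap_fdatawait_range() */
	printf("result: %d\n", err);	/* -5: first failure is preserved */
	return 0;
}
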
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..4e35be873e09 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 		MSDOS_I(inode)->i_start = new_dclus;
 		MSDOS_I(inode)->i_logstart = new_dclus;
 		/*
-		 * Since generic_osync_inode() synchronize later if
-		 * this is not directory, we don't here.
+		 * Since generic_write_sync() synchronizes regular files later,
+		 * we sync here only directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
 			ret = fat_sync_inode(inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da86ef58e427..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,21 +35,29 @@
 int nr_pdflush_threads;
 
 /*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+	long nr_pages;
+	struct super_block *sb;
+	enum writeback_sync_modes sync_mode;
+	int for_kupdate;
+	int range_cyclic;
+};
+
+/*
  * Work items for the bdi_writeback threads
  */
 struct bdi_work {
-	struct list_head list;
-	struct list_head wait_list;
-	struct rcu_head rcu_head;
+	struct list_head list;		/* pending work list */
+	struct rcu_head rcu_head;	/* for RCU free/clear of work */
 
-	unsigned long seen;
-	atomic_t pending;
+	unsigned long seen;		/* threads that have seen this work */
+	atomic_t pending;		/* number of threads still to do work */
 
-	struct super_block *sb;
-	unsigned long nr_pages;
-	enum writeback_sync_modes sync_mode;
+	struct wb_writeback_args args;	/* writeback arguments */
 
-	unsigned long state;
+	unsigned long state;		/* flag bits, see WS_* */
 };
 
 enum {
@@ -66,22 +74,13 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
 }
 
 static inline void bdi_work_init(struct bdi_work *work,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
-	work->sb = wbc->sb;
-	work->nr_pages = wbc->nr_to_write;
-	work->sync_mode = wbc->sync_mode;
+	work->args = *args;
 	work->state = WS_USED;
 }
 
-static inline void bdi_work_init_on_stack(struct bdi_work *work,
-					  struct writeback_control *wbc)
-{
-	bdi_work_init(work, wbc);
-	work->state |= WS_ONSTACK;
-}
-
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -98,6 +97,11 @@ static void bdi_work_clear(struct bdi_work *work)
 {
 	clear_bit(WS_USED_B, &work->state);
 	smp_mb__after_clear_bit();
+	/*
+	 * work can have disappeared at this point. bit waitq functions
+	 * should be able to tolerate this, provided bdi_sched_wait does
+	 * not dereference it's pointer argument.
+	 */
 	wake_up_bit(&work->state, WS_USED_B);
 }
 
@@ -113,7 +117,8 @@ static void bdi_work_free(struct rcu_head *head)
 
 static void wb_work_complete(struct bdi_work *work)
 {
-	const enum writeback_sync_modes sync_mode = work->sync_mode;
+	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+	int onstack = bdi_work_on_stack(work);
 
 	/*
 	 * For allocated work, we can clear the done/seen bit right here.
@@ -121,9 +126,9 @@ static void wb_work_complete(struct bdi_work *work)
 	 * to after the RCU grace period, since the stack could be invalidated
 	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (!bdi_work_on_stack(work))
+	if (!onstack)
 		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+	if (sync_mode == WB_SYNC_NONE || onstack)
 		call_rcu(&work->rcu_head, bdi_work_free);
 }
 
@@ -146,21 +151,19 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
146 151
147static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) 152static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
148{ 153{
149 if (work) { 154 work->seen = bdi->wb_mask;
150 work->seen = bdi->wb_mask; 155 BUG_ON(!work->seen);
151 BUG_ON(!work->seen); 156 atomic_set(&work->pending, bdi->wb_cnt);
152 atomic_set(&work->pending, bdi->wb_cnt); 157 BUG_ON(!bdi->wb_cnt);
153 BUG_ON(!bdi->wb_cnt);
154
155 /*
156 * Make sure stores are seen before it appears on the list
157 */
158 smp_mb();
159 158
160 spin_lock(&bdi->wb_lock); 159 /*
161 list_add_tail_rcu(&work->list, &bdi->work_list); 160 * list_add_tail_rcu() contains the necessary barriers to
162 spin_unlock(&bdi->wb_lock); 161 * make sure the above stores are seen before the item is
 163 } 162 * noticed on the list.
163 */
164 spin_lock(&bdi->wb_lock);
165 list_add_tail_rcu(&work->list, &bdi->work_list);
166 spin_unlock(&bdi->wb_lock);
164 167
165 /* 168 /*
166 * If the default thread isn't there, make sure we add it. When 169 * If the default thread isn't there, make sure we add it. When
@@ -171,15 +174,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
171 else { 174 else {
172 struct bdi_writeback *wb = &bdi->wb; 175 struct bdi_writeback *wb = &bdi->wb;
173 176
174 /* 177 if (wb->task)
175 * If we failed allocating the bdi work item, wake up the wb
176 * thread always. As a safety precaution, it'll flush out
177 * everything
178 */
179 if (!wb_has_dirty_io(wb)) {
180 if (work)
181 wb_clear_pending(wb, work);
182 } else if (wb->task)
183 wake_up_process(wb->task); 178 wake_up_process(wb->task);
184 } 179 }
185} 180}
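The replacement comment states why the explicit smp_mb() could go away: list_add_tail_rcu() is a publish operation whose barriers order the seen/pending stores before the item becomes reachable. A rough C11 analogue of that publish/observe pairing, with hypothetical names and a release store standing in for the RCU list primitive:

    #include <stdatomic.h>
    #include <stddef.h>

    struct item {
        unsigned long seen;                 /* plain fields, set first */
        unsigned int pending;
        struct item *next;
    };

    static struct item *_Atomic list_head;

    /* Writer: initialize everything first, publish last. The release
     * store orders the plain stores above it before the pointer becomes
     * visible to readers. */
    static void publish(struct item *it)
    {
        it->seen = ~0UL;
        it->pending = 4;
        it->next = atomic_load_explicit(&list_head, memory_order_relaxed);
        atomic_store_explicit(&list_head, it, memory_order_release);
    }

    /* Reader: the acquire load pairs with the release store, so any item
     * reached from here is fully initialized, with no separate smp_mb(). */
    static struct item *first_item(void)
    {
        return atomic_load_explicit(&list_head, memory_order_acquire);
    }

    int main(void)
    {
        static struct item it;

        publish(&it);
        return first_item() == &it ? 0 : 1;
    }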
@@ -194,48 +189,75 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
194 TASK_UNINTERRUPTIBLE); 189 TASK_UNINTERRUPTIBLE);
195} 190}
196 191
197static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) 192static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
193 struct wb_writeback_args *args)
198{ 194{
199 struct bdi_work *work; 195 struct bdi_work *work;
200 196
197 /*
198 * This is WB_SYNC_NONE writeback, so if allocation fails just
199 * wakeup the thread for old dirty data writeback
200 */
201 work = kmalloc(sizeof(*work), GFP_ATOMIC); 201 work = kmalloc(sizeof(*work), GFP_ATOMIC);
202 if (work) 202 if (work) {
203 bdi_work_init(work, wbc); 203 bdi_work_init(work, args);
204 bdi_queue_work(bdi, work);
205 } else {
206 struct bdi_writeback *wb = &bdi->wb;
204 207
205 return work; 208 if (wb->task)
209 wake_up_process(wb->task);
210 }
206} 211}
207 212
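The fallback in bdi_alloc_queue_work() follows from the writeback mode: WB_SYNC_NONE work is best effort, so a failed GFP_ATOMIC allocation degrades to waking the flusher thread instead of returning an error. The shape of that policy, as a small self-contained sketch with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    struct req { long nr_pages; };

    struct pool {
        struct req *queued;                 /* toy single-slot queue */
        int kicked;
    };

    static void enqueue(struct pool *p, struct req *r) { p->queued = r; }
    static void wake_worker(struct pool *p) { p->kicked = 1; }

    /* Best-effort submission: if the allocation fails, degrade to a bare
     * wakeup (the worker flushes old dirty data on its own) rather than
     * report an error the caller can do nothing useful with. */
    static void submit_or_kick(struct pool *p, const struct req *r)
    {
        struct req *copy = malloc(sizeof(*copy));

        if (copy) {
            memcpy(copy, r, sizeof(*copy));
            enqueue(p, copy);               /* worker frees it when done */
        } else {
            wake_worker(p);
        }
    }

    int main(void)
    {
        struct pool pool = { 0 };
        struct req req = { .nr_pages = 1024 };

        submit_or_kick(&pool, &req);
        free(pool.queued);
        return 0;
    }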
208void bdi_start_writeback(struct writeback_control *wbc) 213/**
214 * bdi_sync_writeback - start and wait for writeback
215 * @bdi: the backing device to write from
216 * @sb: write inodes from this super_block
217 *
218 * Description:
219 * This does WB_SYNC_ALL data integrity writeback and waits for the
220 * IO to complete. Callers must hold the sb s_umount semaphore for
221 * reading, to avoid having the super disappear before we are done.
222 */
223static void bdi_sync_writeback(struct backing_dev_info *bdi,
224 struct super_block *sb)
209{ 225{
210 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; 226 struct wb_writeback_args args = {
211 struct bdi_work work_stack, *work = NULL; 227 .sb = sb,
228 .sync_mode = WB_SYNC_ALL,
229 .nr_pages = LONG_MAX,
230 .range_cyclic = 0,
231 };
232 struct bdi_work work;
212 233
213 if (!must_wait) 234 bdi_work_init(&work, &args);
214 work = bdi_alloc_work(wbc); 235 work.state |= WS_ONSTACK;
215 236
216 if (!work) { 237 bdi_queue_work(bdi, &work);
217 work = &work_stack; 238 bdi_wait_on_work_clear(&work);
218 bdi_work_init_on_stack(work, wbc); 239}
219 }
220 240
221 bdi_queue_work(wbc->bdi, work); 241/**
242 * bdi_start_writeback - start writeback
243 * @bdi: the backing device to write from
244 * @nr_pages: the number of pages to write
245 *
246 * Description:
247 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
248 * started when this function returns; we make no guarantees about
249 * completion. The caller need not hold the sb s_umount semaphore.
250 *
251 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
253{
254 struct wb_writeback_args args = {
255 .sync_mode = WB_SYNC_NONE,
256 .nr_pages = nr_pages,
257 .range_cyclic = 1,
258 };
222 259
223 /* 260 bdi_alloc_queue_work(bdi, &args);
224 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
225 * complete. If not, we only need to wait for the work to be started,
226 * if we allocated it on-stack. We use the same mechanism, if the
227 * wait bit is set in the bdi_work struct, then threads will not
228 * clear pending until after they are done.
229 *
230 * Note that work == &work_stack if must_wait is true, so we don't
231 * need to do call_rcu() here ever, since the completion path will
232 * have done that for us.
233 */
234 if (must_wait || work == &work_stack) {
235 bdi_wait_on_work_clear(work);
236 if (work != &work_stack)
237 call_rcu(&work->rcu_head, bdi_work_free);
238 }
239} 261}
240 262
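bdi_sync_writeback() and bdi_start_writeback() now embody the two submission modes explicitly: integrity work is placed on the caller's stack and waited for, while opportunistic work is heap-allocated and fire-and-forget. A compressed sketch of that split, assuming hypothetical names and a condition variable in place of the bit waitqueue:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct work {
        bool onstack;                       /* analogue of WS_ONSTACK */
        bool done;
        pthread_mutex_t lock;
        pthread_cond_t cond;
    };

    /* Stand-in for handing work to a flusher thread: completes it inline. */
    static void queue_work(struct work *w)
    {
        if (w->onstack) {
            pthread_mutex_lock(&w->lock);
            w->done = true;
            pthread_cond_signal(&w->cond);
            pthread_mutex_unlock(&w->lock);
        } else {
            free(w);                        /* async work is freed by the worker */
        }
    }

    /* Integrity path: the work lives on our stack, so we must not return
     * until completion is signalled (cf. bdi_sync_writeback). */
    static void submit_sync(void)
    {
        struct work w = { .onstack = true };

        pthread_mutex_init(&w.lock, NULL);
        pthread_cond_init(&w.cond, NULL);
        queue_work(&w);

        pthread_mutex_lock(&w.lock);
        while (!w.done)
            pthread_cond_wait(&w.cond, &w.lock);
        pthread_mutex_unlock(&w.lock);      /* only now may 'w' go out of scope */
    }

    /* Opportunistic path: heap-allocate, hand off, return immediately
     * (cf. bdi_start_writeback). */
    static void submit_async(void)
    {
        struct work *w = calloc(1, sizeof(*w));

        if (w)
            queue_work(w);                  /* on failure: just kick the worker */
    }

    int main(void)
    {
        submit_sync();
        submit_async();
        return 0;
    }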
241/* 263/*
@@ -671,17 +693,16 @@ static inline bool over_bground_thresh(void)
671 * older_than_this takes precedence over nr_to_write. So we'll only write back 693 * older_than_this takes precedence over nr_to_write. So we'll only write back
672 * all dirty pages if they are all attached to "old" mappings. 694 * all dirty pages if they are all attached to "old" mappings.
673 */ 695 */
674static long wb_writeback(struct bdi_writeback *wb, long nr_pages, 696static long wb_writeback(struct bdi_writeback *wb,
675 struct super_block *sb, 697 struct wb_writeback_args *args)
676 enum writeback_sync_modes sync_mode, int for_kupdate)
677{ 698{
678 struct writeback_control wbc = { 699 struct writeback_control wbc = {
679 .bdi = wb->bdi, 700 .bdi = wb->bdi,
680 .sb = sb, 701 .sb = args->sb,
681 .sync_mode = sync_mode, 702 .sync_mode = args->sync_mode,
682 .older_than_this = NULL, 703 .older_than_this = NULL,
683 .for_kupdate = for_kupdate, 704 .for_kupdate = args->for_kupdate,
684 .range_cyclic = 1, 705 .range_cyclic = args->range_cyclic,
685 }; 706 };
686 unsigned long oldest_jif; 707 unsigned long oldest_jif;
687 long wrote = 0; 708 long wrote = 0;
@@ -691,13 +712,18 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
691 oldest_jif = jiffies - 712 oldest_jif = jiffies -
692 msecs_to_jiffies(dirty_expire_interval * 10); 713 msecs_to_jiffies(dirty_expire_interval * 10);
693 } 714 }
715 if (!wbc.range_cyclic) {
716 wbc.range_start = 0;
717 wbc.range_end = LLONG_MAX;
718 }
694 719
695 for (;;) { 720 for (;;) {
696 /* 721 /*
697 * Don't flush anything for non-integrity writeback where 722 * Don't flush anything for non-integrity writeback where
698 * no nr_pages was given 723 * no nr_pages was given
699 */ 724 */
700 if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE) 725 if (!args->for_kupdate && args->nr_pages <= 0 &&
726 args->sync_mode == WB_SYNC_NONE)
701 break; 727 break;
702 728
703 /* 729 /*
@@ -705,7 +731,8 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
705 * periodic background writeout and we are below the 731 * periodic background writeout and we are below the
706 * background dirty threshold, don't do anything 732 * background dirty threshold, don't do anything
707 */ 733 */
708 if (for_kupdate && nr_pages <= 0 && !over_bground_thresh()) 734 if (args->for_kupdate && args->nr_pages <= 0 &&
735 !over_bground_thresh())
709 break; 736 break;
710 737
711 wbc.more_io = 0; 738 wbc.more_io = 0;
@@ -713,7 +740,7 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
713 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 740 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
714 wbc.pages_skipped = 0; 741 wbc.pages_skipped = 0;
715 writeback_inodes_wb(wb, &wbc); 742 writeback_inodes_wb(wb, &wbc);
716 nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 743 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
717 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
718 745
719 /* 746 /*
@@ -731,7 +758,11 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
731 758
732/* 759/*
733 * Return the next bdi_work struct that hasn't been processed by this 760 * Return the next bdi_work struct that hasn't been processed by this
734 * wb thread yet 761 * wb thread yet. ->seen is initially set for each thread that exists
 762 * for this device; when a thread first notices a piece of work, it
763 * clears its bit. Depending on writeback type, the thread will notify
764 * completion on either receiving the work (WB_SYNC_NONE) or after
765 * it is done (WB_SYNC_ALL).
735 */ 766 */
736static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, 767static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
737 struct bdi_writeback *wb) 768 struct bdi_writeback *wb)
@@ -741,8 +772,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
741 rcu_read_lock(); 772 rcu_read_lock();
742 773
743 list_for_each_entry_rcu(work, &bdi->work_list, list) { 774 list_for_each_entry_rcu(work, &bdi->work_list, list) {
744 if (!test_and_clear_bit(wb->nr, &work->seen)) 775 if (!test_bit(wb->nr, &work->seen))
745 continue; 776 continue;
777 clear_bit(wb->nr, &work->seen);
746 778
747 ret = work; 779 ret = work;
748 break; 780 break;
@@ -767,8 +799,16 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
767 global_page_state(NR_UNSTABLE_NFS) + 799 global_page_state(NR_UNSTABLE_NFS) +
768 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 800 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
769 801
770 if (nr_pages) 802 if (nr_pages) {
771 return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1); 803 struct wb_writeback_args args = {
804 .nr_pages = nr_pages,
805 .sync_mode = WB_SYNC_NONE,
806 .for_kupdate = 1,
807 .range_cyclic = 1,
808 };
809
810 return wb_writeback(wb, &args);
811 }
772 812
773 return 0; 813 return 0;
774} 814}
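wb_check_old_data_flush() now funnels the periodic kupdate-style flush through the same wb_writeback_args path as every other caller: estimate the backlog from the global counters and only do the work when it is nonzero. A toy version of that gating, with hypothetical counters standing in for global_page_state() and inodes_stat (the real interval check lives just above this hunk):

    #include <stdio.h>

    /* hypothetical stand-ins for global_page_state() and inodes_stat */
    static long nr_file_dirty = 128;
    static long nr_unstable_nfs = 16;
    static long nr_dirty_inodes = 7;

    struct writeback_args {
        long nr_pages;
        int for_kupdate;
        int range_cyclic;
    };

    static long check_old_data_flush(void)
    {
        long nr_pages = nr_file_dirty + nr_unstable_nfs + nr_dirty_inodes;

        if (nr_pages) {
            struct writeback_args args = {
                .nr_pages = nr_pages,
                .for_kupdate = 1,           /* aged data only */
                .range_cyclic = 1,          /* resume where we left off */
            };
            printf("flushing up to %ld pages\n", args.nr_pages);
            return args.nr_pages;
        }
        return 0;
    }

    int main(void)
    {
        return check_old_data_flush() ? 0 : 1;
    }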
@@ -780,35 +820,31 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
780{ 820{
781 struct backing_dev_info *bdi = wb->bdi; 821 struct backing_dev_info *bdi = wb->bdi;
782 struct bdi_work *work; 822 struct bdi_work *work;
783 long nr_pages, wrote = 0; 823 long wrote = 0;
784 824
785 while ((work = get_next_work_item(bdi, wb)) != NULL) { 825 while ((work = get_next_work_item(bdi, wb)) != NULL) {
786 enum writeback_sync_modes sync_mode; 826 struct wb_writeback_args args = work->args;
787
788 nr_pages = work->nr_pages;
789 827
790 /* 828 /*
791 * Override sync mode, in case we must wait for completion 829 * Override sync mode, in case we must wait for completion
792 */ 830 */
793 if (force_wait) 831 if (force_wait)
794 work->sync_mode = sync_mode = WB_SYNC_ALL; 832 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
795 else
796 sync_mode = work->sync_mode;
797 833
798 /* 834 /*
799 * If this isn't a data integrity operation, just notify 835 * If this isn't a data integrity operation, just notify
800 * that we have seen this work and we are now starting it. 836 * that we have seen this work and we are now starting it.
801 */ 837 */
802 if (sync_mode == WB_SYNC_NONE) 838 if (args.sync_mode == WB_SYNC_NONE)
803 wb_clear_pending(wb, work); 839 wb_clear_pending(wb, work);
804 840
805 wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0); 841 wrote += wb_writeback(wb, &args);
806 842
807 /* 843 /*
808 * This is a data integrity writeback, so only do the 844 * This is a data integrity writeback, so only do the
809 * notification when we have completed the work. 845 * notification when we have completed the work.
810 */ 846 */
811 if (sync_mode == WB_SYNC_ALL) 847 if (args.sync_mode == WB_SYNC_ALL)
812 wb_clear_pending(wb, work); 848 wb_clear_pending(wb, work);
813 } 849 }
814 850
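The handshake that wb_do_writeback() relies on is spread across several hunks: 'seen' is a bitmask of threads that still have to notice the item, 'pending' counts threads that still have to finish it, and the notification point differs by sync mode. This is also why test_and_clear_bit() could be split into test_bit() plus clear_bit() above: each thread only ever clears its own bit. A compact C11 model of the pickup and completion counting, with hypothetical names:

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_THREADS 4

    struct work {
        atomic_ulong seen;       /* bit per thread: still to notice the work */
        atomic_uint pending;     /* threads still to finish the work */
    };

    /* Per-thread pickup: each thread takes the work at most once. */
    static int take_work(struct work *w, unsigned int thread_nr)
    {
        unsigned long bit = 1UL << thread_nr;
        unsigned long old = atomic_fetch_and(&w->seen, ~bit);

        return (old & bit) != 0;         /* true only for the first pickup */
    }

    /* Per-thread completion: the last finisher signals overall completion. */
    static void finish_work(struct work *w)
    {
        if (atomic_fetch_sub(&w->pending, 1) == 1)
            printf("work complete, safe to free/clear\n");
    }

    int main(void)
    {
        struct work w;

        atomic_init(&w.seen, (1UL << NR_THREADS) - 1);
        atomic_init(&w.pending, NR_THREADS);

        for (unsigned int t = 0; t < NR_THREADS; t++)
            if (take_work(&w, t))
                finish_work(&w);
        return 0;
    }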
@@ -849,8 +885,7 @@ int bdi_writeback_task(struct bdi_writeback *wb)
849 } 885 }
850 886
851 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 887 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
852 set_current_state(TASK_INTERRUPTIBLE); 888 schedule_timeout_interruptible(wait_jiffies);
853 schedule_timeout(wait_jiffies);
854 try_to_freeze(); 889 try_to_freeze();
855 } 890 }
856 891
@@ -858,67 +893,28 @@ int bdi_writeback_task(struct bdi_writeback *wb)
858} 893}
859 894
860/* 895/*
861 * Schedule writeback for all backing devices. Expensive! If this is a data 896 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
 862 * integrity operation, writeback will be complete when this returns. If 897 * writeback; for integrity writeback see bdi_sync_writeback().
863 * we are simply called for WB_SYNC_NONE, then writeback will merely be
864 * scheduled to run.
865 */ 898 */
866static void bdi_writeback_all(struct writeback_control *wbc) 899static void bdi_writeback_all(struct super_block *sb, long nr_pages)
867{ 900{
868 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; 901 struct wb_writeback_args args = {
902 .sb = sb,
903 .nr_pages = nr_pages,
904 .sync_mode = WB_SYNC_NONE,
905 };
869 struct backing_dev_info *bdi; 906 struct backing_dev_info *bdi;
870 struct bdi_work *work;
871 LIST_HEAD(list);
872 907
873restart: 908 rcu_read_lock();
874 spin_lock(&bdi_lock);
875
876 list_for_each_entry(bdi, &bdi_list, bdi_list) {
877 struct bdi_work *work;
878 909
910 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
879 if (!bdi_has_dirty_io(bdi)) 911 if (!bdi_has_dirty_io(bdi))
880 continue; 912 continue;
881 913
882 /* 914 bdi_alloc_queue_work(bdi, &args);
883 * If work allocation fails, do the writes inline. We drop
884 * the lock and restart the list writeout. This should be OK,
885 * since this happens rarely and because the writeout should
886 * eventually make more free memory available.
887 */
888 work = bdi_alloc_work(wbc);
889 if (!work) {
890 struct writeback_control __wbc;
891
892 /*
893 * Not a data integrity writeout, just continue
894 */
895 if (!must_wait)
896 continue;
897
898 spin_unlock(&bdi_lock);
899 __wbc = *wbc;
900 __wbc.bdi = bdi;
901 writeback_inodes_wbc(&__wbc);
902 goto restart;
903 }
904 if (must_wait)
905 list_add_tail(&work->wait_list, &list);
906
907 bdi_queue_work(bdi, work);
908 } 915 }
909 916
910 spin_unlock(&bdi_lock); 917 rcu_read_unlock();
911
912 /*
913 * If this is for WB_SYNC_ALL, wait for pending work to complete
914 * before returning.
915 */
916 while (!list_empty(&list)) {
917 work = list_entry(list.next, struct bdi_work, wait_list);
918 list_del(&work->wait_list);
919 bdi_wait_on_work_clear(work);
920 call_rcu(&work->rcu_head, bdi_work_free);
921 }
922} 918}
923 919
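bdi_writeback_all() can now traverse bdi_list under rcu_read_lock() because the loop body never blocks or restarts. The read side of the publish pattern sketched earlier looks roughly like this in C11, with acquire loads standing in for rcu_dereference() and with no attempt to model grace periods (hypothetical names):

    #include <stdatomic.h>
    #include <stdio.h>

    struct node {
        int dirty;
        struct node *_Atomic next;
    };

    static struct node *_Atomic list_head;

    /* Lock-free traversal: every pointer is read with acquire semantics,
     * so the fields of any node we reach are fully initialized. Writers
     * may only append/publish; freeing would still need a grace period. */
    static void flush_all(void)
    {
        struct node *n = atomic_load_explicit(&list_head, memory_order_acquire);

        for (; n; n = atomic_load_explicit(&n->next, memory_order_acquire)) {
            if (!n->dirty)
                continue;
            printf("queueing work for node %p\n", (void *)n);
        }
    }

    int main(void)
    {
        static struct node b = { .dirty = 1 };
        static struct node a = { .dirty = 0 };

        atomic_store_explicit(&a.next, &b, memory_order_release);
        atomic_store_explicit(&list_head, &a, memory_order_release);
        flush_all();
        return 0;
    }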
924/* 920/*
@@ -927,17 +923,10 @@ restart:
927 */ 923 */
928void wakeup_flusher_threads(long nr_pages) 924void wakeup_flusher_threads(long nr_pages)
929{ 925{
930 struct writeback_control wbc = {
931 .sync_mode = WB_SYNC_NONE,
932 .older_than_this = NULL,
933 .range_cyclic = 1,
934 };
935
936 if (nr_pages == 0) 926 if (nr_pages == 0)
937 nr_pages = global_page_state(NR_FILE_DIRTY) + 927 nr_pages = global_page_state(NR_FILE_DIRTY) +
938 global_page_state(NR_UNSTABLE_NFS); 928 global_page_state(NR_UNSTABLE_NFS);
939 wbc.nr_to_write = nr_pages; 929 bdi_writeback_all(NULL, nr_pages);
940 bdi_writeback_all(&wbc);
941} 930}
942 931
943static noinline void block_dump___mark_inode_dirty(struct inode *inode) 932static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1084,7 +1073,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
1084 * on the writer throttling path, and we get decent balancing between many 1073 * on the writer throttling path, and we get decent balancing between many
1085 * throttled threads: we don't want them all piling up on inode_sync_wait. 1074 * throttled threads: we don't want them all piling up on inode_sync_wait.
1086 */ 1075 */
1087static void wait_sb_inodes(struct writeback_control *wbc) 1076static void wait_sb_inodes(struct super_block *sb)
1088{ 1077{
1089 struct inode *inode, *old_inode = NULL; 1078 struct inode *inode, *old_inode = NULL;
1090 1079
@@ -1092,7 +1081,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
1092 * We need to be protected against the filesystem going from 1081 * We need to be protected against the filesystem going from
1093 * r/o to r/w or vice versa. 1082 * r/o to r/w or vice versa.
1094 */ 1083 */
1095 WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount)); 1084 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1096 1085
1097 spin_lock(&inode_lock); 1086 spin_lock(&inode_lock);
1098 1087
@@ -1103,7 +1092,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
1103 * In which case, the inode may not be on the dirty list, but 1092 * In which case, the inode may not be on the dirty list, but
1104 * we still have to wait for that writeout. 1093 * we still have to wait for that writeout.
1105 */ 1094 */
1106 list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) { 1095 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1107 struct address_space *mapping; 1096 struct address_space *mapping;
1108 1097
1109 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 1098 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
@@ -1143,14 +1132,8 @@ static void wait_sb_inodes(struct writeback_control *wbc)
1143 * for IO completion of submitted IO. The number of pages submitted is 1132 * for IO completion of submitted IO.
1144 * returned. 1133 *
1145 */ 1134 */
1146long writeback_inodes_sb(struct super_block *sb) 1135void writeback_inodes_sb(struct super_block *sb)
1147{ 1136{
1148 struct writeback_control wbc = {
1149 .sb = sb,
1150 .sync_mode = WB_SYNC_NONE,
1151 .range_start = 0,
1152 .range_end = LLONG_MAX,
1153 };
1154 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1137 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1155 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1138 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1156 long nr_to_write; 1139 long nr_to_write;
@@ -1158,9 +1141,7 @@ long writeback_inodes_sb(struct super_block *sb)
1158 nr_to_write = nr_dirty + nr_unstable + 1141 nr_to_write = nr_dirty + nr_unstable +
1159 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1160 1143
1161 wbc.nr_to_write = nr_to_write; 1144 bdi_writeback_all(sb, nr_to_write);
1162 bdi_writeback_all(&wbc);
1163 return nr_to_write - wbc.nr_to_write;
1164} 1145}
1165EXPORT_SYMBOL(writeback_inodes_sb); 1146EXPORT_SYMBOL(writeback_inodes_sb);
1166 1147
@@ -1171,20 +1152,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1171 * This function writes and waits on any dirty inode belonging to this 1152 * This function writes and waits on any dirty inode belonging to this
1172 * super_block. The number of pages synced is returned. 1153 * super_block, including IO that was in flight when it was called.
1173 */ 1154 */
1174long sync_inodes_sb(struct super_block *sb) 1155void sync_inodes_sb(struct super_block *sb)
1175{ 1156{
1176 struct writeback_control wbc = { 1157 bdi_sync_writeback(sb->s_bdi, sb);
1177 .sb = sb, 1158 wait_sb_inodes(sb);
1178 .sync_mode = WB_SYNC_ALL,
1179 .range_start = 0,
1180 .range_end = LLONG_MAX,
1181 };
1182 long nr_to_write = LONG_MAX; /* doesn't actually matter */
1183
1184 wbc.nr_to_write = nr_to_write;
1185 bdi_writeback_all(&wbc);
1186 wait_sb_inodes(&wbc);
1187 return nr_to_write - wbc.nr_to_write;
1188} 1159}
1189EXPORT_SYMBOL(sync_inodes_sb); 1160EXPORT_SYMBOL(sync_inodes_sb);
1190 1161
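sync_inodes_sb() is now explicitly two-phase: bdi_sync_writeback() starts and waits for the writeback it initiates, and wait_sb_inodes() additionally waits for IO that was already in flight. The same start-everything-then-wait split can be seen from userspace with Linux's sync_file_range(); a sketch, with error handling elided and with the caveat that sync_file_range() does not touch metadata:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Phase 1: start writeback on every file first, so requests can be
     * merged and serviced in parallel. */
    static void start_all(int *fds, int n)
    {
        for (int i = 0; i < n; i++)
            sync_file_range(fds[i], 0, 0, SYNC_FILE_RANGE_WRITE);
    }

    /* Phase 2: wait, covering both the IO from phase 1 and any writeback
     * that was already in flight before we started. */
    static void wait_all(int *fds, int n)
    {
        for (int i = 0; i < n; i++)
            sync_file_range(fds[i], 0, 0,
                            SYNC_FILE_RANGE_WAIT_BEFORE |
                            SYNC_FILE_RANGE_WRITE |
                            SYNC_FILE_RANGE_WAIT_AFTER);
    }

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0)
            return 1;
        write(fd, "x", 1);
        start_all(&fd, 1);
        wait_all(&fd, 1);
        close(fd);
        return 0;
    }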
@@ -1242,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242 return ret; 1213 return ret;
1243} 1214}
1244EXPORT_SYMBOL(sync_inode); 1215EXPORT_SYMBOL(sync_inode);
1245
1246/**
1247 * generic_osync_inode - flush all dirty data for a given inode to disk
1248 * @inode: inode to write
1249 * @mapping: the address_space that should be flushed
1250 * @what: what to write and wait upon
1251 *
1252 * This can be called by file_write functions for files which have the
1253 * O_SYNC flag set, to flush dirty writes to disk.
1254 *
1255 * @what is a bitmask, specifying which part of the inode's data should be
1256 * written and waited upon.
1257 *
1258 * OSYNC_DATA: i_mapping's dirty data
1259 * OSYNC_METADATA: the buffers at i_mapping->private_list
1260 * OSYNC_INODE: the inode itself
1261 */
1262
1263int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
1264{
1265 int err = 0;
1266 int need_write_inode_now = 0;
1267 int err2;
1268
1269 if (what & OSYNC_DATA)
1270 err = filemap_fdatawrite(mapping);
1271 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
1272 err2 = sync_mapping_buffers(mapping);
1273 if (!err)
1274 err = err2;
1275 }
1276 if (what & OSYNC_DATA) {
1277 err2 = filemap_fdatawait(mapping);
1278 if (!err)
1279 err = err2;
1280 }
1281
1282 spin_lock(&inode_lock);
1283 if ((inode->i_state & I_DIRTY) &&
1284 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
1285 need_write_inode_now = 1;
1286 spin_unlock(&inode_lock);
1287
1288 if (need_write_inode_now) {
1289 err2 = write_inode_now(inode, 1);
1290 if (!err)
1291 err = err2;
1292 }
1293 else
1294 inode_sync_wait(inode);
1295
1296 return err;
1297}
1298EXPORT_SYMBOL(generic_osync_inode);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6f9430..e5dbecd87b0f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -894,6 +894,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
894 if (err) 894 if (err)
895 goto err_put_conn; 895 goto err_put_conn;
896 896
897 sb->s_bdi = &fc->bdi;
898
897 /* Handle umasking inside the fuse code */ 899 /* Handle umasking inside the fuse code */
898 if (sb->s_flags & MS_POSIXACL) 900 if (sb->s_flags & MS_POSIXACL)
899 fc->dont_mask = 1; 901 fc->dont_mask = 1;
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS := -I$(src) 1EXTRA_CFLAGS := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
5 aops.o dentry.o export.o file.o \ 5 aops.o dentry.o export.o file.o \
6 ops_fstype.o ops_inode.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,8 +19,7 @@
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
21#include "acl.h" 21#include "acl.h"
22#include "eaops.h" 22#include "xattr.h"
23#include "eattr.h"
24#include "glock.h" 23#include "glock.h"
25#include "inode.h" 24#include "inode.h"
26#include "meta_io.h" 25#include "meta_io.h"
@@ -31,8 +30,7 @@
31#define ACL_DEFAULT 0 30#define ACL_DEFAULT 0
32 31
33int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, 32int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
34 struct gfs2_ea_request *er, 33 struct gfs2_ea_request *er, int *remove, mode_t *mode)
35 int *remove, mode_t *mode)
36{ 34{
37 struct posix_acl *acl; 35 struct posix_acl *acl;
38 int error; 36 int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
83 return 0; 81 return 0;
84} 82}
85 83
86static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, 84static int acl_get(struct gfs2_inode *ip, const char *name,
87 struct gfs2_ea_location *el, char **data, unsigned int *len) 85 struct posix_acl **acl, struct gfs2_ea_location *el,
86 char **datap, unsigned int *lenp)
88{ 87{
89 struct gfs2_ea_request er; 88 char *data;
90 struct gfs2_ea_location el_this; 89 unsigned int len;
91 int error; 90 int error;
92 91
92 el->el_bh = NULL;
93
93 if (!ip->i_eattr) 94 if (!ip->i_eattr)
94 return 0; 95 return 0;
95 96
96 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
97 if (access) {
98 er.er_name = GFS2_POSIX_ACL_ACCESS;
99 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
100 } else {
101 er.er_name = GFS2_POSIX_ACL_DEFAULT;
102 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
103 }
104 er.er_type = GFS2_EATYPE_SYS;
105
106 if (!el)
107 el = &el_this;
108
109 error = gfs2_ea_find(ip, &er, el);
110 if (error) 98 if (error)
111 return error; 99 return error;
112 if (!el->el_ea) 100 if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
114 if (!GFS2_EA_DATA_LEN(el->el_ea)) 102 if (!GFS2_EA_DATA_LEN(el->el_ea))
115 goto out; 103 goto out;
116 104
117 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); 105 len = GFS2_EA_DATA_LEN(el->el_ea);
118 er.er_data = kmalloc(er.er_data_len, GFP_NOFS); 106 data = kmalloc(len, GFP_NOFS);
119 error = -ENOMEM; 107 error = -ENOMEM;
120 if (!er.er_data) 108 if (!data)
121 goto out; 109 goto out;
122 110
123 error = gfs2_ea_get_copy(ip, el, er.er_data); 111 error = gfs2_ea_get_copy(ip, el, data, len);
124 if (error) 112 if (error < 0)
125 goto out_kfree; 113 goto out_kfree;
114 error = 0;
126 115
127 if (acl) { 116 if (acl) {
128 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); 117 *acl = posix_acl_from_xattr(data, len);
129 if (IS_ERR(*acl)) 118 if (IS_ERR(*acl))
130 error = PTR_ERR(*acl); 119 error = PTR_ERR(*acl);
131 } 120 }
132 121
133out_kfree: 122out_kfree:
134 if (error || !data) 123 if (error || !datap) {
135 kfree(er.er_data); 124 kfree(data);
136 else { 125 } else {
137 *data = er.er_data; 126 *datap = data;
138 *len = er.er_data_len; 127 *lenp = len;
139 } 128 }
140out: 129out:
141 if (error || el == &el_this)
142 brelse(el->el_bh);
143 return error; 130 return error;
144} 131}
145 132
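The reworked acl_get() also nails down buffer ownership for its out parameters: every failure path (and the no-datap case) frees the allocation locally, and ownership transfers to the caller only on success. That convention in isolation, with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    /* Ownership rule: *datap/*lenp are written only on success; every
     * failure path frees the buffer locally, so callers never have to
     * guess who owns it. */
    static int read_blob(const char *src, char **datap, size_t *lenp)
    {
        size_t len = strlen(src);
        char *data = malloc(len);
        int error = data ? 0 : -1;

        if (!error)
            memcpy(data, src, len);

        if (error || !datap) {
            free(data);              /* free(NULL) is a safe no-op */
        } else {
            *datap = data;           /* ownership moves to the caller */
            *lenp = len;
        }
        return error;
    }

    int main(void)
    {
        char *buf;
        size_t n;
        int err = read_blob("acl-bytes", &buf, &n);

        if (!err)
            free(buf);
        return err ? 1 : 0;
    }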
@@ -153,10 +140,12 @@ out:
153 140
154int gfs2_check_acl(struct inode *inode, int mask) 141int gfs2_check_acl(struct inode *inode, int mask)
155{ 142{
143 struct gfs2_ea_location el;
156 struct posix_acl *acl = NULL; 144 struct posix_acl *acl = NULL;
157 int error; 145 int error;
158 146
159 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); 147 error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
148 brelse(el.el_bh);
160 if (error) 149 if (error)
161 return error; 150 return error;
162 151
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
196 185
197int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) 186int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
198{ 187{
188 struct gfs2_ea_location el;
199 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 189 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
200 struct posix_acl *acl = NULL, *clone; 190 struct posix_acl *acl = NULL, *clone;
201 struct gfs2_ea_request er;
202 mode_t mode = ip->i_inode.i_mode; 191 mode_t mode = ip->i_inode.i_mode;
192 char *data = NULL;
193 unsigned int len;
203 int error; 194 int error;
204 195
205 if (!sdp->sd_args.ar_posix_acl) 196 if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
207 if (S_ISLNK(ip->i_inode.i_mode)) 198 if (S_ISLNK(ip->i_inode.i_mode))
208 return 0; 199 return 0;
209 200
210 memset(&er, 0, sizeof(struct gfs2_ea_request)); 201 error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
211 er.er_type = GFS2_EATYPE_SYS; 202 brelse(el.el_bh);
212
213 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
214 &er.er_data, &er.er_data_len);
215 if (error) 203 if (error)
216 return error; 204 return error;
217 if (!acl) { 205 if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
229 acl = clone; 217 acl = clone;
230 218
231 if (S_ISDIR(ip->i_inode.i_mode)) { 219 if (S_ISDIR(ip->i_inode.i_mode)) {
232 er.er_name = GFS2_POSIX_ACL_DEFAULT; 220 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
233 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; 221 GFS2_POSIX_ACL_DEFAULT, data, len, 0);
234 error = gfs2_system_eaops.eo_set(ip, &er);
235 if (error) 222 if (error)
236 goto out; 223 goto out;
237 } 224 }
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
239 error = posix_acl_create_masq(acl, &mode); 226 error = posix_acl_create_masq(acl, &mode);
240 if (error < 0) 227 if (error < 0)
241 goto out; 228 goto out;
242 if (error > 0) { 229 if (error == 0)
243 er.er_name = GFS2_POSIX_ACL_ACCESS; 230 goto munge;
244 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
245 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
246 er.er_mode = mode;
247 er.er_flags = GFS2_ERF_MODE;
248 error = gfs2_system_eaops.eo_set(ip, &er);
249 if (error)
250 goto out;
251 } else
252 munge_mode(ip, mode);
253 231
232 posix_acl_to_xattr(acl, data, len);
233 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
234 GFS2_POSIX_ACL_ACCESS, data, len, 0);
235 if (error)
236 goto out;
237munge:
238 error = munge_mode(ip, mode);
254out: 239out:
255 posix_acl_release(acl); 240 posix_acl_release(acl);
256 kfree(er.er_data); 241 kfree(data);
257 return error; 242 return error;
258} 243}
259 244
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
265 unsigned int len; 250 unsigned int len;
266 int error; 251 int error;
267 252
268 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); 253 error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
269 if (error) 254 if (error)
270 return error; 255 goto out_brelse;
271 if (!acl) 256 if (!acl)
272 return gfs2_setattr_simple(ip, attr); 257 return gfs2_setattr_simple(ip, attr);
273 258
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
286 271
287out: 272out:
288 posix_acl_release(acl); 273 posix_acl_release(acl);
289 brelse(el.el_bh);
290 kfree(data); 274 kfree(data);
275out_brelse:
276 brelse(el.el_bh);
291 return error; 277 return error;
292} 278}
293 279
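The gfs2_acl_chmod() hunk is a cleanup-ordering fix: the early-error path previously returned without releasing el.el_bh, and routing it through a dedicated label guarantees the buffer head is dropped on every exit. The idiom reduced to a skeleton, with hypothetical resource names:

    struct res { int held; };

    static int acquire(struct res *r) { r->held = 1; return 0; }
    static void release(struct res *r) { r->held = 0; }
    static int do_step(void) { return -1; /* pretend the step fails */ }

    static int op(void)
    {
        struct res bh;
        int error = acquire(&bh);

        if (error)
            return error;            /* nothing held yet: plain return */

        error = do_step();
        if (error)
            goto out_release;        /* NOT a bare return: bh is held */

        /* ... main work ... */

    out_release:
        release(&bh);                /* runs on success and failure alike */
        return error;
    }

    int main(void)
    {
        return op() == -1 ? 0 : 1;   /* the failing step still cleaned up */
    }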
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
107 return 0; 107 return 0;
108} 108}
109 109
110static int gfs2_dentry_delete(struct dentry *dentry)
111{
112 struct gfs2_inode *ginode;
113
114 if (!dentry->d_inode)
115 return 0;
116
117 ginode = GFS2_I(dentry->d_inode);
118 if (!ginode->i_iopen_gh.gh_gl)
119 return 0;
120
121 if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
122 return 1;
123
124 return 0;
125}
126
110const struct dentry_operations gfs2_dops = { 127const struct dentry_operations gfs2_dops = {
111 .d_revalidate = gfs2_drevalidate, 128 .d_revalidate = gfs2_drevalidate,
112 .d_hash = gfs2_dhash, 129 .d_hash = gfs2_dhash,
130 .d_delete = gfs2_dentry_delete,
113}; 131};
114 132
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/capability.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "acl.h"
22#include "eaops.h"
23#include "eattr.h"
24#include "util.h"
25
26/**
27 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
28 * @name: ea name, possibly with type appended
29 *
30 * Returns: GFS2_EATYPE_XXX
31 */
32
33unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
34{
35 unsigned int type;
36
37 if (strncmp(name, "system.", 7) == 0) {
38 type = GFS2_EATYPE_SYS;
39 if (truncated_name)
40 *truncated_name = name + sizeof("system.") - 1;
41 } else if (strncmp(name, "user.", 5) == 0) {
42 type = GFS2_EATYPE_USR;
43 if (truncated_name)
44 *truncated_name = name + sizeof("user.") - 1;
45 } else if (strncmp(name, "security.", 9) == 0) {
46 type = GFS2_EATYPE_SECURITY;
47 if (truncated_name)
48 *truncated_name = name + sizeof("security.") - 1;
49 } else {
50 type = GFS2_EATYPE_UNUSED;
51 if (truncated_name)
52 *truncated_name = NULL;
53 }
54
55 return type;
56}
57
58static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
59{
60 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
61 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
62 !capable(CAP_SYS_ADMIN))
63 return -EPERM;
64
65 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
66 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
67 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
68 return -EOPNOTSUPP;
69
70 return gfs2_ea_get_i(ip, er);
71}
72
73static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
74{
75 int remove = 0;
76 int error;
77
78 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
79 if (!(er->er_flags & GFS2_ERF_MODE)) {
80 er->er_mode = ip->i_inode.i_mode;
81 er->er_flags |= GFS2_ERF_MODE;
82 }
83 error = gfs2_acl_validate_set(ip, 1, er,
84 &remove, &er->er_mode);
85 if (error)
86 return error;
87 error = gfs2_ea_set_i(ip, er);
88 if (error)
89 return error;
90 if (remove)
91 gfs2_ea_remove_i(ip, er);
92 return 0;
93
94 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
95 error = gfs2_acl_validate_set(ip, 0, er,
96 &remove, NULL);
97 if (error)
98 return error;
99 if (!remove)
100 error = gfs2_ea_set_i(ip, er);
101 else {
102 error = gfs2_ea_remove_i(ip, er);
103 if (error == -ENODATA)
104 error = 0;
105 }
106 return error;
107 }
108
109 return -EPERM;
110}
111
112static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
113{
114 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
115 int error = gfs2_acl_validate_remove(ip, 1);
116 if (error)
117 return error;
118
119 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
120 int error = gfs2_acl_validate_remove(ip, 0);
121 if (error)
122 return error;
123
124 } else
125 return -EPERM;
126
127 return gfs2_ea_remove_i(ip, er);
128}
129
130static const struct gfs2_eattr_operations gfs2_user_eaops = {
131 .eo_get = gfs2_ea_get_i,
132 .eo_set = gfs2_ea_set_i,
133 .eo_remove = gfs2_ea_remove_i,
134 .eo_name = "user",
135};
136
137const struct gfs2_eattr_operations gfs2_system_eaops = {
138 .eo_get = system_eo_get,
139 .eo_set = system_eo_set,
140 .eo_remove = system_eo_remove,
141 .eo_name = "system",
142};
143
144static const struct gfs2_eattr_operations gfs2_security_eaops = {
145 .eo_get = gfs2_ea_get_i,
146 .eo_set = gfs2_ea_set_i,
147 .eo_remove = gfs2_ea_remove_i,
148 .eo_name = "security",
149};
150
151const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
152 NULL,
153 &gfs2_user_eaops,
154 &gfs2_system_eaops,
155 &gfs2_security_eaops,
156};
157
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern const struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
143} 143}
144 144
145static struct dentry *gfs2_get_dentry(struct super_block *sb, 145static struct dentry *gfs2_get_dentry(struct super_block *sb,
146 struct gfs2_inum_host *inum) 146 struct gfs2_inum_host *inum)
147{ 147{
148 struct gfs2_sbd *sdp = sb->s_fs_info; 148 struct gfs2_sbd *sdp = sb->s_fs_info;
149 struct gfs2_holder i_gh, ri_gh, rgd_gh; 149 struct gfs2_holder i_gh;
150 struct gfs2_rgrpd *rgd;
151 struct inode *inode; 150 struct inode *inode;
152 struct dentry *dentry; 151 struct dentry *dentry;
153 int error; 152 int error;
154 153
155 /* System files? */
156
157 inode = gfs2_ilookup(sb, inum->no_addr); 154 inode = gfs2_ilookup(sb, inum->no_addr);
158 if (inode) { 155 if (inode) {
159 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 156 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
168 if (error) 165 if (error)
169 return ERR_PTR(error); 166 return ERR_PTR(error);
170 167
171 error = gfs2_rindex_hold(sdp, &ri_gh); 168 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
172 if (error) 169 if (error)
173 goto fail; 170 goto fail;
174 171
175 error = -EINVAL; 172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
176 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
177 if (!rgd)
178 goto fail_rindex;
179
180 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
181 if (error)
182 goto fail_rindex;
183
184 error = -ESTALE;
185 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
186 goto fail_rgd;
187
188 gfs2_glock_dq_uninit(&rgd_gh);
189 gfs2_glock_dq_uninit(&ri_gh);
190
191 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
192 inum->no_addr,
193 0, 0);
194 if (IS_ERR(inode)) { 173 if (IS_ERR(inode)) {
195 error = PTR_ERR(inode); 174 error = PTR_ERR(inode);
196 goto fail; 175 goto fail;
@@ -224,13 +203,6 @@ out_inode:
224 if (!IS_ERR(dentry)) 203 if (!IS_ERR(dentry))
225 dentry->d_op = &gfs2_dops; 204 dentry->d_op = &gfs2_dops;
226 return dentry; 205 return dentry;
227
228fail_rgd:
229 gfs2_glock_dq_uninit(&rgd_gh);
230
231fail_rindex:
232 gfs2_glock_dq_uninit(&ri_gh);
233
234fail: 206fail:
235 gfs2_glock_dq_uninit(&i_gh); 207 gfs2_glock_dq_uninit(&i_gh);
236 return ERR_PTR(error); 208 return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..166f38fbd246 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
38#include "rgrp.h" 38#include "rgrp.h"
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h"
42 41
43/** 42/**
44 * gfs2_llseek - seek to a location in a file 43 * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
406#define GFS2_DATA_WRITEBACK 1 406#define GFS2_DATA_WRITEBACK 1
407#define GFS2_DATA_ORDERED 2 407#define GFS2_DATA_ORDERED 2
408 408
409#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
410#define GFS2_ERRORS_WITHDRAW 0
411#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */
412#define GFS2_ERRORS_RO 2 /* place holder for future feature */
413#define GFS2_ERRORS_PANIC 3
414
409struct gfs2_args { 415struct gfs2_args {
410 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 416 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
411 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 417 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -422,6 +428,7 @@ struct gfs2_args {
422 unsigned int ar_data:2; /* ordered/writeback */ 428 unsigned int ar_data:2; /* ordered/writeback */
423 unsigned int ar_meta:1; /* mount metafs */ 429 unsigned int ar_meta:1; /* mount metafs */
424 unsigned int ar_discard:1; /* discard requests */ 430 unsigned int ar_discard:1; /* discard requests */
431 unsigned int ar_errors:2; /* errors=withdraw | panic */
425 int ar_commit; /* Commit interval */ 432 int ar_commit; /* Commit interval */
426}; 433};
427 434
@@ -489,7 +496,6 @@ struct gfs2_sb_host {
489 */ 496 */
490 497
491struct lm_lockstruct { 498struct lm_lockstruct {
492 u32 ls_id;
493 unsigned int ls_jid; 499 unsigned int ls_jid;
494 unsigned int ls_first; 500 unsigned int ls_first;
495 unsigned int ls_first_done; 501 unsigned int ls_first_done;
@@ -541,18 +547,12 @@ struct gfs2_sbd {
541 struct dentry *sd_root_dir; 547 struct dentry *sd_root_dir;
542 548
543 struct inode *sd_jindex; 549 struct inode *sd_jindex;
544 struct inode *sd_inum_inode;
545 struct inode *sd_statfs_inode; 550 struct inode *sd_statfs_inode;
546 struct inode *sd_ir_inode;
547 struct inode *sd_sc_inode; 551 struct inode *sd_sc_inode;
548 struct inode *sd_qc_inode; 552 struct inode *sd_qc_inode;
549 struct inode *sd_rindex; 553 struct inode *sd_rindex;
550 struct inode *sd_quota_inode; 554 struct inode *sd_quota_inode;
551 555
552 /* Inum stuff */
553
554 struct mutex sd_inum_mutex;
555
556 /* StatFS stuff */ 556 /* StatFS stuff */
557 557
558 spinlock_t sd_statfs_spin; 558 spinlock_t sd_statfs_spin;
@@ -580,7 +580,6 @@ struct gfs2_sbd {
580 struct gfs2_holder sd_journal_gh; 580 struct gfs2_holder sd_journal_gh;
581 struct gfs2_holder sd_jinode_gh; 581 struct gfs2_holder sd_jinode_gh;
582 582
583 struct gfs2_holder sd_ir_gh;
584 struct gfs2_holder sd_sc_gh; 583 struct gfs2_holder sd_sc_gh;
585 struct gfs2_holder sd_qc_gh; 584 struct gfs2_holder sd_qc_gh;
586 585
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
24#include "acl.h" 24#include "acl.h"
25#include "bmap.h" 25#include "bmap.h"
26#include "dir.h" 26#include "dir.h"
27#include "eattr.h" 27#include "xattr.h"
28#include "glock.h" 28#include "glock.h"
29#include "glops.h" 29#include "glops.h"
30#include "inode.h" 30#include "inode.h"
@@ -519,139 +519,6 @@ out:
519 return inode ? inode : ERR_PTR(error); 519 return inode ? inode : ERR_PTR(error);
520} 520}
521 521
522static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
523{
524 const struct gfs2_inum_range *str = buf;
525
526 ir->ir_start = be64_to_cpu(str->ir_start);
527 ir->ir_length = be64_to_cpu(str->ir_length);
528}
529
530static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
531{
532 struct gfs2_inum_range *str = buf;
533
534 str->ir_start = cpu_to_be64(ir->ir_start);
535 str->ir_length = cpu_to_be64(ir->ir_length);
536}
537
538static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
539{
540 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
541 struct buffer_head *bh;
542 struct gfs2_inum_range_host ir;
543 int error;
544
545 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
546 if (error)
547 return error;
548 mutex_lock(&sdp->sd_inum_mutex);
549
550 error = gfs2_meta_inode_buffer(ip, &bh);
551 if (error) {
552 mutex_unlock(&sdp->sd_inum_mutex);
553 gfs2_trans_end(sdp);
554 return error;
555 }
556
557 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
558
559 if (ir.ir_length) {
560 *formal_ino = ir.ir_start++;
561 ir.ir_length--;
562 gfs2_trans_add_bh(ip->i_gl, bh, 1);
563 gfs2_inum_range_out(&ir,
564 bh->b_data + sizeof(struct gfs2_dinode));
565 brelse(bh);
566 mutex_unlock(&sdp->sd_inum_mutex);
567 gfs2_trans_end(sdp);
568 return 0;
569 }
570
571 brelse(bh);
572
573 mutex_unlock(&sdp->sd_inum_mutex);
574 gfs2_trans_end(sdp);
575
576 return 1;
577}
578
579static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
580{
581 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
582 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
583 struct gfs2_holder gh;
584 struct buffer_head *bh;
585 struct gfs2_inum_range_host ir;
586 int error;
587
588 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
589 if (error)
590 return error;
591
592 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
593 if (error)
594 goto out;
595 mutex_lock(&sdp->sd_inum_mutex);
596
597 error = gfs2_meta_inode_buffer(ip, &bh);
598 if (error)
599 goto out_end_trans;
600
601 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
602
603 if (!ir.ir_length) {
604 struct buffer_head *m_bh;
605 u64 x, y;
606 __be64 z;
607
608 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
609 if (error)
610 goto out_brelse;
611
612 z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
613 x = y = be64_to_cpu(z);
614 ir.ir_start = x;
615 ir.ir_length = GFS2_INUM_QUANTUM;
616 x += GFS2_INUM_QUANTUM;
617 if (x < y)
618 gfs2_consist_inode(m_ip);
619 z = cpu_to_be64(x);
620 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
621 *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
622
623 brelse(m_bh);
624 }
625
626 *formal_ino = ir.ir_start++;
627 ir.ir_length--;
628
629 gfs2_trans_add_bh(ip->i_gl, bh, 1);
630 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
631
632out_brelse:
633 brelse(bh);
634out_end_trans:
635 mutex_unlock(&sdp->sd_inum_mutex);
636 gfs2_trans_end(sdp);
637out:
638 gfs2_glock_dq_uninit(&gh);
639 return error;
640}
641
642static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
643{
644 int error;
645
646 error = pick_formal_ino_1(sdp, inum);
647 if (error <= 0)
648 return error;
649
650 error = pick_formal_ino_2(sdp, inum);
651
652 return error;
653}
654
655/** 522/**
656 * create_ok - OK to create a new on-disk inode here? 523 * create_ok - OK to create a new on-disk inode here?
657 * @dip: Directory in which dinode is to be created 524 * @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
731 if (error) 598 if (error)
732 goto out_ipreserv; 599 goto out_ipreserv;
733 600
734 *no_addr = gfs2_alloc_di(dip, generation); 601 error = gfs2_alloc_di(dip, no_addr, generation);
735 602
736 gfs2_trans_end(sdp); 603 gfs2_trans_end(sdp);
737 604
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
924 size_t len; 791 size_t len;
925 void *value; 792 void *value;
926 char *name; 793 char *name;
927 struct gfs2_ea_request er;
928 794
929 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 795 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
930 &name, &value, &len); 796 &name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
935 return err; 801 return err;
936 } 802 }
937 803
938 memset(&er, 0, sizeof(struct gfs2_ea_request)); 804 err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
939
940 er.er_type = GFS2_EATYPE_SECURITY;
941 er.er_name = name;
942 er.er_data = value;
943 er.er_name_len = strlen(name);
944 er.er_data_len = len;
945
946 err = gfs2_ea_set_i(ip, &er);
947
948 kfree(value); 805 kfree(value);
949 kfree(name); 806 kfree(name);
950 807
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
991 if (error) 848 if (error)
992 goto fail_gunlock; 849 goto fail_gunlock;
993 850
994 error = pick_formal_ino(sdp, &inum.no_formal_ino);
995 if (error)
996 goto fail_gunlock;
997
998 error = alloc_dinode(dip, &inum.no_addr, &generation); 851 error = alloc_dinode(dip, &inum.no_addr, &generation);
999 if (error) 852 if (error)
1000 goto fail_gunlock; 853 goto fail_gunlock;
854 inum.no_formal_ino = generation;
1001 855
1002 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, 856 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
1003 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); 857 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1008 if (error) 862 if (error)
1009 goto fail_gunlock2; 863 goto fail_gunlock2;
1010 864
1011 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), 865 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
1012 inum.no_addr, 866 inum.no_formal_ino, 0);
1013 inum.no_formal_ino, 0);
1014 if (IS_ERR(inode)) 867 if (IS_ERR(inode))
1015 goto fail_gunlock2; 868 goto fail_gunlock2;
1016 869
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
84 84
85 gfs2_tune_init(&sdp->sd_tune); 85 gfs2_tune_init(&sdp->sd_tune);
86 86
87 mutex_init(&sdp->sd_inum_mutex);
88 spin_lock_init(&sdp->sd_statfs_spin); 87 spin_lock_init(&sdp->sd_statfs_spin);
89 88
90 spin_lock_init(&sdp->sd_rindex_spin); 89 spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
833 if (error) 832 if (error)
834 goto fail; 833 goto fail;
835 834
836 /* Read in the master inode number inode */
837 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
838 if (IS_ERR(sdp->sd_inum_inode)) {
839 error = PTR_ERR(sdp->sd_inum_inode);
840 fs_err(sdp, "can't read in inum inode: %d\n", error);
841 goto fail_journal;
842 }
843
844
845 /* Read in the master statfs inode */ 835 /* Read in the master statfs inode */
846 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); 836 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
847 if (IS_ERR(sdp->sd_statfs_inode)) { 837 if (IS_ERR(sdp->sd_statfs_inode)) {
848 error = PTR_ERR(sdp->sd_statfs_inode); 838 error = PTR_ERR(sdp->sd_statfs_inode);
849 fs_err(sdp, "can't read in statfs inode: %d\n", error); 839 fs_err(sdp, "can't read in statfs inode: %d\n", error);
850 goto fail_inum; 840 goto fail_journal;
851 } 841 }
852 842
853 /* Read in the resource index inode */ 843 /* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
876 iput(sdp->sd_rindex); 866 iput(sdp->sd_rindex);
877fail_statfs: 867fail_statfs:
878 iput(sdp->sd_statfs_inode); 868 iput(sdp->sd_statfs_inode);
879fail_inum:
880 iput(sdp->sd_inum_inode);
881fail_journal: 869fail_journal:
882 init_journal(sdp, UNDO); 870 init_journal(sdp, UNDO);
883fail: 871fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
905 return error; 893 return error;
906 } 894 }
907 895
908 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
909 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
910 if (IS_ERR(sdp->sd_ir_inode)) {
911 error = PTR_ERR(sdp->sd_ir_inode);
912 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
913 goto fail;
914 }
915
916 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); 896 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
917 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); 897 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
918 if (IS_ERR(sdp->sd_sc_inode)) { 898 if (IS_ERR(sdp->sd_sc_inode)) {
919 error = PTR_ERR(sdp->sd_sc_inode); 899 error = PTR_ERR(sdp->sd_sc_inode);
920 fs_err(sdp, "can't find local \"sc\" file: %d\n", error); 900 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
921 goto fail_ir_i; 901 goto fail;
922 } 902 }
923 903
924 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); 904 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
932 iput(pn); 912 iput(pn);
933 pn = NULL; 913 pn = NULL;
934 914
935 ip = GFS2_I(sdp->sd_ir_inode);
936 error = gfs2_glock_nq_init(ip->i_gl,
937 LM_ST_EXCLUSIVE, 0,
938 &sdp->sd_ir_gh);
939 if (error) {
940 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
941 goto fail_qc_i;
942 }
943
944 ip = GFS2_I(sdp->sd_sc_inode); 915 ip = GFS2_I(sdp->sd_sc_inode);
945 error = gfs2_glock_nq_init(ip->i_gl, 916 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
946 LM_ST_EXCLUSIVE, 0,
947 &sdp->sd_sc_gh); 917 &sdp->sd_sc_gh);
948 if (error) { 918 if (error) {
949 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); 919 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
950 goto fail_ir_gh; 920 goto fail_qc_i;
951 } 921 }
952 922
953 ip = GFS2_I(sdp->sd_qc_inode); 923 ip = GFS2_I(sdp->sd_qc_inode);
954 error = gfs2_glock_nq_init(ip->i_gl, 924 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
955 LM_ST_EXCLUSIVE, 0,
956 &sdp->sd_qc_gh); 925 &sdp->sd_qc_gh);
957 if (error) { 926 if (error) {
958 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); 927 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
965 gfs2_glock_dq_uninit(&sdp->sd_qc_gh); 934 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
966fail_ut_gh: 935fail_ut_gh:
967 gfs2_glock_dq_uninit(&sdp->sd_sc_gh); 936 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
968fail_ir_gh:
969 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
970fail_qc_i: 937fail_qc_i:
971 iput(sdp->sd_qc_inode); 938 iput(sdp->sd_qc_inode);
972fail_ut_i: 939fail_ut_i:
973 iput(sdp->sd_sc_inode); 940 iput(sdp->sd_sc_inode);
974fail_ir_i:
975 iput(sdp->sd_ir_inode);
976fail: 941fail:
977 if (pn) 942 if (pn)
978 iput(pn); 943 iput(pn);
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1063 1028
1064 ls->ls_ops = lm; 1029 ls->ls_ops = lm;
1065 ls->ls_first = 1; 1030 ls->ls_first = 1;
1066 ls->ls_id = 0;
1067 1031
1068 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { 1032 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
1069 substring_t tmp[MAX_OPT_ARGS]; 1033 substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1081 ls->ls_jid = option; 1045 ls->ls_jid = option;
1082 break; 1046 break;
1083 case Opt_id: 1047 case Opt_id:
1084 ret = match_int(&tmp[0], &option); 1048 /* Obsolete, but left for backward compat purposes */
1085 if (ret)
1086 goto hostdata_error;
1087 ls->ls_id = option;
1088 break; 1049 break;
1089 case Opt_first: 1050 case Opt_first:
1090 ret = match_int(&tmp[0], &option); 1051 ret = match_int(&tmp[0], &option);
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1133 lm->lm_unmount(sdp); 1094 lm->lm_unmount(sdp);
1134} 1095}
1135 1096
1097void gfs2_online_uevent(struct gfs2_sbd *sdp)
1098{
1099 struct super_block *sb = sdp->sd_vfs;
1100 char ro[20];
1101 char spectator[20];
1102 char *envp[] = { ro, spectator, NULL };
1103 sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
1104 sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
1105 kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
1106}
1107
1136/** 1108/**
1137 * fill_super - Read in superblock 1109 * fill_super - Read in superblock
1138 * @sb: The VFS superblock 1110 * @sb: The VFS superblock
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1157 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; 1129 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1158 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; 1130 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1159 sdp->sd_args.ar_commit = 60; 1131 sdp->sd_args.ar_commit = 60;
1132 sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
1160 1133
1161 error = gfs2_mount_args(sdp, &sdp->sd_args, data); 1134 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1162 if (error) { 1135 if (error) {
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1174 sb->s_magic = GFS2_MAGIC; 1147 sb->s_magic = GFS2_MAGIC;
1175 sb->s_op = &gfs2_super_ops; 1148 sb->s_op = &gfs2_super_ops;
1176 sb->s_export_op = &gfs2_export_ops; 1149 sb->s_export_op = &gfs2_export_ops;
1150 sb->s_xattr = gfs2_xattr_handlers;
1177 sb->s_time_gran = 1; 1151 sb->s_time_gran = 1;
1178 sb->s_maxbytes = MAX_LFS_FILESIZE; 1152 sb->s_maxbytes = MAX_LFS_FILESIZE;
1179 1153
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1236 } 1210 }
1237 1211
1238 gfs2_glock_dq_uninit(&mount_gh); 1212 gfs2_glock_dq_uninit(&mount_gh);
1239 1213 gfs2_online_uevent(sdp);
1240 return 0; 1214 return 0;
1241 1215
1242fail_threads: 1216fail_threads:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..c3ac18054057 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,8 +26,7 @@
26#include "acl.h" 26#include "acl.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "dir.h" 28#include "dir.h"
29#include "eaops.h" 29#include "xattr.h"
30#include "eattr.h"
31#include "glock.h" 30#include "glock.h"
32#include "inode.h" 31#include "inode.h"
33#include "meta_io.h" 32#include "meta_io.h"
@@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
349 348
350 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 349 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
351 if (error) 350 if (error)
352 goto out_rgrp; 351 goto out_gunlock;
353 352
354 error = gfs2_dir_del(dip, &dentry->d_name); 353 error = gfs2_dir_del(dip, &dentry->d_name);
355 if (error) 354 if (error)
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
1302 const void *data, size_t size, int flags) 1301 const void *data, size_t size, int flags)
1303{ 1302{
1304 struct inode *inode = dentry->d_inode; 1303 struct inode *inode = dentry->d_inode;
1305 struct gfs2_ea_request er; 1304 struct gfs2_inode *ip = GFS2_I(inode);
1306 1305 struct gfs2_holder gh;
1307 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1306 int ret;
1308 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1309 if (er.er_type == GFS2_EATYPE_UNUSED)
1310 return -EOPNOTSUPP;
1311 er.er_data = (char *)data;
1312 er.er_name_len = strlen(er.er_name);
1313 er.er_data_len = size;
1314 er.er_flags = flags;
1315
1316 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1317 1307
1318 return gfs2_ea_set(GFS2_I(inode), &er); 1308 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1309 ret = gfs2_glock_nq(&gh);
1310 if (ret == 0) {
1311 ret = generic_setxattr(dentry, name, data, size, flags);
1312 gfs2_glock_dq(&gh);
1313 }
1314 gfs2_holder_uninit(&gh);
1315 return ret;
1319} 1316}
1320 1317
1321static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, 1318static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1322 void *data, size_t size) 1319 void *data, size_t size)
1323{ 1320{
1324 struct gfs2_ea_request er; 1321 struct inode *inode = dentry->d_inode;
1325 1322 struct gfs2_inode *ip = GFS2_I(inode);
1326 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1323 struct gfs2_holder gh;
1327 er.er_type = gfs2_ea_name2type(name, &er.er_name); 1324 int ret;
1328 if (er.er_type == GFS2_EATYPE_UNUSED)
1329 return -EOPNOTSUPP;
1330 er.er_data = data;
1331 er.er_name_len = strlen(er.er_name);
1332 er.er_data_len = size;
1333
1334 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1335}
1336
1337static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1338{
1339 struct gfs2_ea_request er;
1340
1341 memset(&er, 0, sizeof(struct gfs2_ea_request));
1342 er.er_data = (size) ? buffer : NULL;
1343 er.er_data_len = size;
1344 1325
1345 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); 1326 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1327 ret = gfs2_glock_nq(&gh);
1328 if (ret == 0) {
1329 ret = generic_getxattr(dentry, name, data, size);
1330 gfs2_glock_dq(&gh);
1331 }
1332 gfs2_holder_uninit(&gh);
1333 return ret;
1346} 1334}
1347 1335
1348static int gfs2_removexattr(struct dentry *dentry, const char *name) 1336static int gfs2_removexattr(struct dentry *dentry, const char *name)
1349{ 1337{
1350 struct gfs2_ea_request er; 1338 struct inode *inode = dentry->d_inode;
1351 1339 struct gfs2_inode *ip = GFS2_I(inode);
1352 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1340 struct gfs2_holder gh;
1353 er.er_type = gfs2_ea_name2type(name, &er.er_name); 1341 int ret;
1354 if (er.er_type == GFS2_EATYPE_UNUSED)
1355 return -EOPNOTSUPP;
1356 er.er_name_len = strlen(er.er_name);
1357 1342
1358 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1343 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1344 ret = gfs2_glock_nq(&gh);
1345 if (ret == 0) {
1346 ret = generic_removexattr(dentry, name);
1347 gfs2_glock_dq(&gh);
1348 }
1349 gfs2_holder_uninit(&gh);
1350 return ret;
1359} 1351}
1360 1352
1361static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1353static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
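All three xattr operations above now share one shape: acquire the inode's glock (exclusive for set/remove, shared for get), delegate to the VFS generic_*xattr helper, which dispatches through the sb->s_xattr handler table installed in fill_super earlier in this series, then drop the lock. A sketch of that shared skeleton (an illustrative abstraction assuming GFS2-internal context, not code from the patch):

static int example_op_under_glock(struct gfs2_inode *ip, unsigned int state,
				  int flags, int (*op)(void *arg), void *arg)
{
	struct gfs2_holder gh;
	int ret;

	gfs2_holder_init(ip->i_gl, state, flags, &gh);
	ret = gfs2_glock_nq(&gh);	/* enqueue: take the cluster lock */
	if (ret == 0) {
		ret = op(arg);		/* e.g. wraps generic_setxattr() */
		gfs2_glock_dq(&gh);	/* dequeue: release it */
	}
	gfs2_holder_uninit(&gh);
	return ret;
}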
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..28c590b7c9da 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
857 goto start_new_extent; 857 goto start_new_extent;
858 if ((start + nr_sects) != blk) { 858 if ((start + nr_sects) != blk) {
859 rv = blkdev_issue_discard(bdev, start, 859 rv = blkdev_issue_discard(bdev, start,
860 nr_sects, GFP_NOFS); 860 nr_sects, GFP_NOFS,
861 DISCARD_FL_BARRIER);
861 if (rv) 862 if (rv)
862 goto fail; 863 goto fail;
863 nr_sects = 0; 864 nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
871 } 872 }
872 } 873 }
873 if (nr_sects) { 874 if (nr_sects) {
874 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); 875 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
876 DISCARD_FL_BARRIER);
875 if (rv) 877 if (rv)
876 goto fail; 878 goto fail;
877 } 879 }
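The extra argument reflects a blkdev_issue_discard() signature change in this kernel generation: the helper now takes a flags word, and DISCARD_FL_BARRIER asks for the discard to be issued as a barrier so it is ordered against surrounding I/O. A one-line wrapper showing the new call shape (illustrative; the flag was dropped again in later kernels):

#include <linux/blkdev.h>

static int example_discard_extent(struct block_device *bdev,
				  sector_t start, sector_t nr_sects)
{
	/* GFP_NOFS: this may run with filesystem locks held */
	return blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
				    DISCARD_FL_BARRIER);
}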
@@ -1256,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1256 * Returns: The block type (GFS2_BLKST_*) 1258 * Returns: The block type (GFS2_BLKST_*)
1257 */ 1259 */
1258 1260
1259unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) 1261static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1260{ 1262{
1261 struct gfs2_bitmap *bi = NULL; 1263 struct gfs2_bitmap *bi = NULL;
1262 u32 length, rgrp_block, buf_block; 1264 u32 length, rgrp_block, buf_block;
@@ -1459,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1459 return 0; 1461 return 0;
1460} 1462}
1461 1463
1464static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
1465{
1466 struct gfs2_sbd *sdp = rgd->rd_sbd;
1467 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
1468 (unsigned long long)rgd->rd_addr);
1469 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1470 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1471 rgd->rd_flags |= GFS2_RDF_ERROR;
1472}
1473
1462/** 1474/**
1463 * gfs2_alloc_block - Allocate one or more blocks 1475 * gfs2_alloc_block - Allocate one or more blocks
1464 * @ip: the inode to allocate the block for 1476 * @ip: the inode to allocate the block for
@@ -1520,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1520 return 0; 1532 return 0;
1521 1533
1522rgrp_error: 1534rgrp_error:
1523 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", 1535 gfs2_rgrp_error(rgd);
1524 (unsigned long long)rgd->rd_addr);
1525 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1526 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1527 rgd->rd_flags |= GFS2_RDF_ERROR;
1528 return -EIO; 1536 return -EIO;
1529} 1537}
1530 1538
1531/** 1539/**
1532 * gfs2_alloc_di - Allocate a dinode 1540 * gfs2_alloc_di - Allocate a dinode
1533 * @dip: the directory that the inode is going in 1541 * @dip: the directory that the inode is going in
1542 * @bn: the block number which is allocated
1543 * @generation: the generation number of the inode
1534 * 1544 *
1535 * Returns: the block allocated 1545 * Returns: 0 on success, or -errno on failure
1536 */ 1546 */
1537 1547
1538u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) 1548int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
1539{ 1549{
1540 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1550 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1541 struct gfs2_alloc *al = dip->i_alloc; 1551 struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1546 1556
1547 blk = rgblk_search(rgd, rgd->rd_last_alloc, 1557 blk = rgblk_search(rgd, rgd->rd_last_alloc,
1548 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); 1558 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
1549 BUG_ON(blk == BFITNOENT);
1550 1559
1551 rgd->rd_last_alloc = blk; 1560 /* Since all blocks are reserved in advance, this shouldn't happen */
1561 if (blk == BFITNOENT)
1562 goto rgrp_error;
1552 1563
1564 rgd->rd_last_alloc = blk;
1553 block = rgd->rd_data0 + blk; 1565 block = rgd->rd_data0 + blk;
1566 if (rgd->rd_free == 0)
1567 goto rgrp_error;
1554 1568
1555 gfs2_assert_withdraw(sdp, rgd->rd_free);
1556 rgd->rd_free--; 1569 rgd->rd_free--;
1557 rgd->rd_dinodes++; 1570 rgd->rd_dinodes++;
1558 *generation = rgd->rd_igeneration++; 1571 *generation = rgd->rd_igeneration++;
1572 if (*generation == 0)
1573 *generation = rgd->rd_igeneration++;
1559 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1574 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1560 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1575 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1561 1576
@@ -1568,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1568 rgd->rd_free_clone--; 1583 rgd->rd_free_clone--;
1569 spin_unlock(&sdp->sd_rindex_spin); 1584 spin_unlock(&sdp->sd_rindex_spin);
1570 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); 1585 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1571 return block; 1586 *bn = block;
1587 return 0;
1588
1589rgrp_error:
1590 gfs2_rgrp_error(rgd);
1591 return -EIO;
1572} 1592}
1573 1593
1574/** 1594/**
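gfs2_alloc_di() now reports failure through its return value instead of returning the block number directly, and it skips a generation of zero so that value stays reserved as "invalid". A sketch of the new calling convention (illustrative, assuming GFS2-internal context; the real call site is in fs/gfs2/inode.c, outside this diff):

static int example_alloc_dinode(struct gfs2_inode *dip, u64 *no_addr)
{
	u64 generation;
	int error;

	error = gfs2_alloc_di(dip, no_addr, &generation);
	if (error)
		return error;	/* -EIO: the rgrp was marked in error */
	/* generation != 0 is guaranteed by the allocator above */
	return 0;
}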
@@ -1676,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1676} 1696}
1677 1697
1678/** 1698/**
1699 * gfs2_check_blk_type - Check the type of a block
1700 * @sdp: The superblock
1701 * @no_addr: The block number to check
1702 * @type: The block type we are looking for
1703 *
1704 * Returns: 0 if the block type matches the expected type
1705 * -ESTALE if it doesn't match
1706 * or a negative errno if something went wrong while checking
1707 */
1708
1709int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1710{
1711 struct gfs2_rgrpd *rgd;
1712 struct gfs2_holder ri_gh, rgd_gh;
1713 int error;
1714
1715 error = gfs2_rindex_hold(sdp, &ri_gh);
1716 if (error)
1717 goto fail;
1718
1719 error = -EINVAL;
1720 rgd = gfs2_blk2rgrpd(sdp, no_addr);
1721 if (!rgd)
1722 goto fail_rindex;
1723
1724 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
1725 if (error)
1726 goto fail_rindex;
1727
1728 if (gfs2_get_block_type(rgd, no_addr) != type)
1729 error = -ESTALE;
1730
1731 gfs2_glock_dq_uninit(&rgd_gh);
1732fail_rindex:
1733 gfs2_glock_dq_uninit(&ri_gh);
1734fail:
1735 return error;
1736}
1737
1738/**
1679 * gfs2_rlist_add - add a RG to a list of RGs 1739 * gfs2_rlist_add - add a RG to a list of RGs
1680 * @sdp: the filesystem 1740 * @sdp: the filesystem
1681 * @rlist: the list of resource groups 1741 * @rlist: the list of resource groups
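gfs2_check_blk_type() gets its first user in the fs/gfs2/super.c hunk further down, where gfs2_delete_inode() verifies that the inode's block is still marked unlinked before deallocating it. Condensed, that usage looks like (illustrative excerpt):

	error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
	if (error)
		goto out_truncate;	/* e.g. -ESTALE: the block is no
					   longer unlinked, so skip the
					   deallocation path */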
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
44 44
45extern void gfs2_inplace_release(struct gfs2_inode *ip); 45extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48
49extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 47extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
50extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 48extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
51 49
52extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 50extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
53extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 51extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
54extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 52extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
55extern void gfs2_unlink_di(struct inode *inode); 53extern void gfs2_unlink_di(struct inode *inode);
54extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
55 unsigned int type);
56 56
57struct gfs2_rgrp_list { 57struct gfs2_rgrp_list {
58 unsigned int rl_rgrps; 58 unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
38#include "trans.h" 38#include "trans.h"
39#include "util.h" 39#include "util.h"
40#include "sys.h" 40#include "sys.h"
41#include "eattr.h" 41#include "xattr.h"
42 42
43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) 43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
44 44
@@ -68,6 +68,8 @@ enum {
68 Opt_discard, 68 Opt_discard,
69 Opt_nodiscard, 69 Opt_nodiscard,
70 Opt_commit, 70 Opt_commit,
71 Opt_err_withdraw,
72 Opt_err_panic,
71 Opt_error, 73 Opt_error,
72}; 74};
73 75
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
97 {Opt_discard, "discard"}, 99 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"}, 100 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"}, 101 {Opt_commit, "commit=%d"},
102 {Opt_err_withdraw, "errors=withdraw"},
103 {Opt_err_panic, "errors=panic"},
100 {Opt_error, NULL} 104 {Opt_error, NULL}
101}; 105};
102 106
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
152 args->ar_localcaching = 1; 156 args->ar_localcaching = 1;
153 break; 157 break;
154 case Opt_debug: 158 case Opt_debug:
159 if (args->ar_errors == GFS2_ERRORS_PANIC) {
160 fs_info(sdp, "-o debug and -o errors=panic "
161 "are mutually exclusive.\n");
162 return -EINVAL;
163 }
155 args->ar_debug = 1; 164 args->ar_debug = 1;
156 break; 165 break;
157 case Opt_nodebug: 166 case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
205 return rv ? rv : -EINVAL; 214 return rv ? rv : -EINVAL;
206 } 215 }
207 break; 216 break;
217 case Opt_err_withdraw:
218 args->ar_errors = GFS2_ERRORS_WITHDRAW;
219 break;
220 case Opt_err_panic:
221 if (args->ar_debug) {
222 fs_info(sdp, "-o debug and -o errors=panic "
223 "are mutually exclusive.\n");
224 return -EINVAL;
225 }
226 args->ar_errors = GFS2_ERRORS_PANIC;
227 break;
208 case Opt_error: 228 case Opt_error:
209 default: 229 default:
210 fs_info(sdp, "invalid mount option: %s\n", o); 230 fs_info(sdp, "invalid mount option: %s\n", o);
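The new errors= options slot into the standard <linux/parser.h> idiom used throughout gfs2_mount_args(): tokenize with strsep(), classify with match_token() against a match_table_t. A stripped-down sketch of that idiom (illustrative; the example_* and Opt_ex_* names are hypothetical):

#include <linux/parser.h>
#include <linux/string.h>
#include <linux/errno.h>

enum { Opt_ex_withdraw, Opt_ex_panic, Opt_ex_error };

static const match_table_t example_tokens = {
	{Opt_ex_withdraw, "errors=withdraw"},
	{Opt_ex_panic,    "errors=panic"},
	{Opt_ex_error,    NULL}
};

static int example_parse(char *options)
{
	substring_t tmp[MAX_OPT_ARGS];
	char *o;

	while ((o = strsep(&options, ",")) != NULL) {
		if (!*o)
			continue;
		switch (match_token(o, example_tokens, tmp)) {
		case Opt_ex_withdraw:
			/* record GFS2_ERRORS_WITHDRAW */
			break;
		case Opt_ex_panic:
			/* record GFS2_ERRORS_PANIC */
			break;
		default:
			return -EINVAL;	/* unknown option */
		}
	}
	return 0;
}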
@@ -768,7 +788,6 @@ restart:
768 /* Release stuff */ 788 /* Release stuff */
769 789
770 iput(sdp->sd_jindex); 790 iput(sdp->sd_jindex);
771 iput(sdp->sd_inum_inode);
772 iput(sdp->sd_statfs_inode); 791 iput(sdp->sd_statfs_inode);
773 iput(sdp->sd_rindex); 792 iput(sdp->sd_rindex);
774 iput(sdp->sd_quota_inode); 793 iput(sdp->sd_quota_inode);
@@ -779,10 +798,8 @@ restart:
779 if (!sdp->sd_args.ar_spectator) { 798 if (!sdp->sd_args.ar_spectator) {
780 gfs2_glock_dq_uninit(&sdp->sd_journal_gh); 799 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
781 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); 800 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
782 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
783 gfs2_glock_dq_uninit(&sdp->sd_sc_gh); 801 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
784 gfs2_glock_dq_uninit(&sdp->sd_qc_gh); 802 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
785 iput(sdp->sd_ir_inode);
786 iput(sdp->sd_sc_inode); 803 iput(sdp->sd_sc_inode);
787 iput(sdp->sd_qc_inode); 804 iput(sdp->sd_qc_inode);
788 } 805 }
@@ -1084,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1084 gt->gt_log_flush_secs = args.ar_commit; 1101 gt->gt_log_flush_secs = args.ar_commit;
1085 spin_unlock(&gt->gt_spin); 1102 spin_unlock(&gt->gt_spin);
1086 1103
1104 gfs2_online_uevent(sdp);
1087 return 0; 1105 return 0;
1088} 1106}
1089 1107
@@ -1225,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1225 lfsecs = sdp->sd_tune.gt_log_flush_secs; 1243 lfsecs = sdp->sd_tune.gt_log_flush_secs;
1226 if (lfsecs != 60) 1244 if (lfsecs != 60)
1227 seq_printf(s, ",commit=%d", lfsecs); 1245 seq_printf(s, ",commit=%d", lfsecs);
1246 if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
1247 const char *state;
1248
1249 switch (args->ar_errors) {
1250 case GFS2_ERRORS_WITHDRAW:
1251 state = "withdraw";
1252 break;
1253 case GFS2_ERRORS_PANIC:
1254 state = "panic";
1255 break;
1256 default:
1257 state = "unknown";
1258 break;
1259 }
1260 seq_printf(s, ",errors=%s", state);
1261 }
1228 return 0; 1262 return 0;
1229} 1263}
1230 1264
@@ -1252,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
1252 goto out; 1286 goto out;
1253 } 1287 }
1254 1288
1289 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
1290 if (error)
1291 goto out_truncate;
1292
1255 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1293 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1256 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1294 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1257 error = gfs2_glock_nq(&ip->i_iopen_gh); 1295 error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
25 return x; 25 return x;
26} 26}
27 27
28void gfs2_jindex_free(struct gfs2_sbd *sdp); 28extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); 30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
31 31
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp); 36 struct gfs2_inode **ipp);
37 37
38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39 39extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
40extern int gfs2_statfs_init(struct gfs2_sbd *sdp); 40extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, 41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
42 s64 dinodes); 42 s64 dinodes);
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[];
57 58
58#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
59 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a7cbfbd340c7..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/genhd.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
319 return ret; 320 return ret;
320} 321}
321 322
322static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
323{
324 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
325 return sprintf(buf, "%u\n", ls->ls_id);
326}
327
328static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) 323static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
329{ 324{
330 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 325 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -389,7 +384,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
389GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); 384GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
390GDLM_ATTR(block, 0644, block_show, block_store); 385GDLM_ATTR(block, 0644, block_show, block_store);
391GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 386GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
392GDLM_ATTR(id, 0444, lkid_show, NULL);
393GDLM_ATTR(jid, 0444, jid_show, NULL); 387GDLM_ATTR(jid, 0444, jid_show, NULL);
394GDLM_ATTR(first, 0444, lkfirst_show, NULL); 388GDLM_ATTR(first, 0444, lkfirst_show, NULL);
395GDLM_ATTR(first_done, 0444, first_done_show, NULL); 389GDLM_ATTR(first_done, 0444, first_done_show, NULL);
@@ -401,7 +395,6 @@ static struct attribute *lock_module_attrs[] = {
401 &gdlm_attr_proto_name.attr, 395 &gdlm_attr_proto_name.attr,
402 &gdlm_attr_block.attr, 396 &gdlm_attr_block.attr,
403 &gdlm_attr_withdraw.attr, 397 &gdlm_attr_withdraw.attr,
404 &gdlm_attr_id.attr,
405 &gdlm_attr_jid.attr, 398 &gdlm_attr_jid.attr,
406 &gdlm_attr_first.attr, 399 &gdlm_attr_first.attr,
407 &gdlm_attr_first_done.attr, 400 &gdlm_attr_first_done.attr,
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = {
519 512
520int gfs2_sys_fs_add(struct gfs2_sbd *sdp) 513int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
521{ 514{
515 struct super_block *sb = sdp->sd_vfs;
522 int error; 516 int error;
517 char ro[20];
518 char spectator[20];
519 char *envp[] = { ro, spectator, NULL };
520
521 sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
522 sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
523 523
524 sdp->sd_kobj.kset = gfs2_kset; 524 sdp->sd_kobj.kset = gfs2_kset;
525 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, 525 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
535 if (error) 535 if (error)
536 goto fail_tune; 536 goto fail_tune;
537 537
538 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); 538 error = sysfs_create_link(&sdp->sd_kobj,
539 &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
540 "device");
541 if (error)
542 goto fail_lock_module;
543
544 kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
539 return 0; 545 return 0;
540 546
547fail_lock_module:
548 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
541fail_tune: 549fail_tune:
542 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 550 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
543fail_reg: 551fail_reg:
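The new sysfs_create_link() call follows the usual goto-unwind idiom: each setup step that can fail jumps to a label that tears down, in reverse order, only what has already succeeded. A generic sketch of the idiom (illustrative; the parameters stand in for the concrete groups and kobjects above):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static int example_setup(struct kobject *kobj,
			 const struct attribute_group *grp,
			 struct kobject *target)
{
	int error;

	error = sysfs_create_group(kobj, grp);
	if (error)
		goto fail;
	error = sysfs_create_link(kobj, target, "device");
	if (error)
		goto fail_group;	/* undo only the completed steps */
	return 0;

fail_group:
	sysfs_remove_group(kobj, grp);
fail:
	return error;
}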
@@ -549,12 +557,12 @@ fail:
549 557
550void gfs2_sys_fs_del(struct gfs2_sbd *sdp) 558void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
551{ 559{
560 sysfs_remove_link(&sdp->sd_kobj, "device");
552 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 561 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
553 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); 562 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
554 kobject_put(&sdp->sd_kobj); 563 kobject_put(&sdp->sd_kobj);
555} 564}
556 565
557
558static int gfs2_uevent(struct kset *kset, struct kobject *kobj, 566static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
559 struct kobj_uevent_env *env) 567 struct kobj_uevent_env *env)
560{ 568{
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
563 571
564 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 572 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
565 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 573 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
574 if (!sdp->sd_args.ar_spectator)
575 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
566 if (gfs2_uuid_valid(uuid)) { 576 if (gfs2_uuid_valid(uuid)) {
567 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" 577 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
568 "%02X%02X-%02X%02X%02X%02X%02X%02X", 578 "%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
578 .uevent = gfs2_uevent, 588 .uevent = gfs2_uevent,
579}; 589};
580 590
581
582int gfs2_sys_init(void) 591int gfs2_sys_init(void)
583{ 592{
584 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); 593 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
38 const struct lm_lockops *lm = ls->ls_ops; 38 const struct lm_lockops *lm = ls->ls_ops;
39 va_list args; 39 va_list args;
40 40
41 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 41 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
42 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
42 return 0; 43 return 0;
43 44
44 va_start(args, fmt); 45 va_start(args, fmt);
45 vprintk(fmt, args); 46 vprintk(fmt, args);
46 va_end(args); 47 va_end(args);
47 48
48 fs_err(sdp, "about to withdraw this file system\n"); 49 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
49 BUG_ON(sdp->sd_args.ar_debug); 50 fs_err(sdp, "about to withdraw this file system\n");
51 BUG_ON(sdp->sd_args.ar_debug);
50 52
51 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); 53 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
52 54
53 if (lm->lm_unmount) { 55 if (lm->lm_unmount) {
54 fs_err(sdp, "telling LM to unmount\n"); 56 fs_err(sdp, "telling LM to unmount\n");
55 lm->lm_unmount(sdp); 57 lm->lm_unmount(sdp);
58 }
59 fs_err(sdp, "withdrawn\n");
60 dump_stack();
56 } 61 }
57 fs_err(sdp, "withdrawn\n"); 62
58 dump_stack(); 63 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
64 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
59 65
60 return -1; 66 return -1;
61} 67}
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
93 gfs2_tune_get(sdp, gt_complain_secs) * HZ)) 99 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
94 return -2; 100 return -2;
95 101
96 printk(KERN_WARNING 102 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
97 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 103 printk(KERN_WARNING
98 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 104 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
99 sdp->sd_fsname, assertion, 105 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
100 sdp->sd_fsname, function, file, line); 106 sdp->sd_fsname, assertion,
107 sdp->sd_fsname, function, file, line);
101 108
102 if (sdp->sd_args.ar_debug) 109 if (sdp->sd_args.ar_debug)
103 BUG(); 110 BUG();
104 else 111 else
105 dump_stack(); 112 dump_stack();
106 113
114 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
115 panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
116 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
117 sdp->sd_fsname, assertion,
118 sdp->sd_fsname, function, file, line);
119
107 sdp->sd_last_warning = jiffies; 120 sdp->sd_last_warning = jiffies;
108 121
109 return -1; 122 return -1;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529adda..8a0f8ef6ee27 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
18#include "gfs2.h" 18#include "gfs2.h"
19#include "incore.h" 19#include "incore.h"
20#include "acl.h" 20#include "acl.h"
21#include "eaops.h" 21#include "xattr.h"
22#include "eattr.h"
23#include "glock.h" 22#include "glock.h"
24#include "inode.h" 23#include "inode.h"
25#include "meta_io.h" 24#include "meta_io.h"
@@ -38,26 +37,32 @@
38 * Returns: 1 if the EA should be stuffed 37 * Returns: 1 if the EA should be stuffed
39 */ 38 */
40 39
41static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, 40static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
42 unsigned int *size) 41 unsigned int *size)
43{ 42{
44 *size = GFS2_EAREQ_SIZE_STUFFED(er); 43 unsigned int jbsize = sdp->sd_jbsize;
45 if (*size <= sdp->sd_jbsize) 44
45 /* Stuffed */
46 *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
47
48 if (*size <= jbsize)
46 return 1; 49 return 1;
47 50
48 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); 51 /* Unstuffed */
52 *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
53 (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
49 54
50 return 0; 55 return 0;
51} 56}
52 57
53static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) 58static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
54{ 59{
55 unsigned int size; 60 unsigned int size;
56 61
57 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) 62 if (dsize > GFS2_EA_MAX_DATA_LEN)
58 return -ERANGE; 63 return -ERANGE;
59 64
60 ea_calc_size(sdp, er, &size); 65 ea_calc_size(sdp, nsize, dsize, &size);
61 66
62 /* This can only happen with 512 byte blocks */ 67 /* This can only happen with 512 byte blocks */
63 if (size > sdp->sd_jbsize) 68 if (size > sdp->sd_jbsize)
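In the rewritten helper, a "stuffed" xattr keeps name and data inline in one EA record, while an "unstuffed" one keeps only block pointers to the data. A worked example of the two formulas (the numbers are illustrative assumptions: a 16-byte struct gfs2_ea_header and sd_jbsize == 4072, i.e. a 4KiB block minus the metadata header):

/*
 *   nsize = 5, dsize = 100:
 *     ALIGN(16 + 5 + 100, 8)  = 128   <= 4072  -> stuffed
 *
 *   nsize = 5, dsize = 8000:
 *     ALIGN(16 + 5 + 8000, 8) = 8024  >  4072  -> unstuffed:
 *     ALIGN(16 + 5 + sizeof(__be64) * DIV_ROUND_UP(8000, 4072), 8)
 *       = ALIGN(16 + 5 + 8 * 2, 8) = 40
 *     (the record holds two block pointers instead of the data itself)
 */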
@@ -151,7 +156,9 @@ out:
151} 156}
152 157
153struct ea_find { 158struct ea_find {
154 struct gfs2_ea_request *ef_er; 159 int type;
160 const char *name;
161 size_t namel;
155 struct gfs2_ea_location *ef_el; 162 struct gfs2_ea_location *ef_el;
156}; 163};
157 164
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
160 void *private) 167 void *private)
161{ 168{
162 struct ea_find *ef = private; 169 struct ea_find *ef = private;
163 struct gfs2_ea_request *er = ef->ef_er;
164 170
165 if (ea->ea_type == GFS2_EATYPE_UNUSED) 171 if (ea->ea_type == GFS2_EATYPE_UNUSED)
166 return 0; 172 return 0;
167 173
168 if (ea->ea_type == er->er_type) { 174 if (ea->ea_type == ef->type) {
169 if (ea->ea_name_len == er->er_name_len && 175 if (ea->ea_name_len == ef->namel &&
170 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { 176 !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
171 struct gfs2_ea_location *el = ef->ef_el; 177 struct gfs2_ea_location *el = ef->ef_el;
172 get_bh(bh); 178 get_bh(bh);
173 el->el_bh = bh; 179 el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
180 return 0; 186 return 0;
181} 187}
182 188
183int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, 189int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
184 struct gfs2_ea_location *el) 190 struct gfs2_ea_location *el)
185{ 191{
186 struct ea_find ef; 192 struct ea_find ef;
187 int error; 193 int error;
188 194
189 ef.ef_er = er; 195 ef.type = type;
196 ef.name = name;
197 ef.namel = strlen(name);
190 ef.ef_el = el; 198 ef.ef_el = el;
191 199
192 memset(el, 0, sizeof(struct gfs2_ea_location)); 200 memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
344 unsigned int ei_size; 352 unsigned int ei_size;
345}; 353};
346 354
355static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
356{
357 switch (ea->ea_type) {
358 case GFS2_EATYPE_USR:
359 return 5 + ea->ea_name_len + 1;
360 case GFS2_EATYPE_SYS:
361 return 7 + ea->ea_name_len + 1;
362 case GFS2_EATYPE_SECURITY:
363 return 9 + ea->ea_name_len + 1;
364 default:
365 return 0;
366 }
367}
368
347static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, 369static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
348 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, 370 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
349 void *private) 371 void *private)
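The constants in gfs2_ea_strlen() are the namespace prefixes that userspace sees, re-attached to the on-disk name, plus a trailing NUL. Spelled out (a restatement, not new logic):

/*
 *   GFS2_EATYPE_USR      -> strlen("user.")     == 5
 *   GFS2_EATYPE_SYS      -> strlen("system.")   == 7
 *   GFS2_EATYPE_SECURITY -> strlen("security.") == 9
 *
 * so each name reported by listxattr(2) occupies
 * prefix + ea_name_len + 1 bytes in the output buffer.
 */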
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
392} 414}
393 415
394/** 416/**
395 * gfs2_ea_list - 417 * gfs2_listxattr - List gfs2 extended attributes
396 * @ip: 418 * @dentry: The dentry whose inode we are interested in
397 * @er: 419 * @buffer: The buffer to write the results
420 * @size: The size of the buffer
398 * 421 *
399 * Returns: actual size of data on success, -errno on error 422 * Returns: actual size of data on success, -errno on error
400 */ 423 */
401 424
402int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) 425ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
403{ 426{
427 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
428 struct gfs2_ea_request er;
404 struct gfs2_holder i_gh; 429 struct gfs2_holder i_gh;
405 int error; 430 int error;
406 431
407 if (!er->er_data || !er->er_data_len) { 432 memset(&er, 0, sizeof(struct gfs2_ea_request));
408 er->er_data = NULL; 433 if (size) {
409 er->er_data_len = 0; 434 er.er_data = buffer;
435 er.er_data_len = size;
410 } 436 }
411 437
412 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 438 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 return error; 440 return error;
415 441
416 if (ip->i_eattr) { 442 if (ip->i_eattr) {
417 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 443 struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
418 444
419 error = ea_foreach(ip, ea_list_i, &ei); 445 error = ea_foreach(ip, ea_list_i, &ei);
420 if (!error) 446 if (!error)
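On the consuming side, the er_data == NULL / size == 0 handling above supports the usual listxattr(2) size-probe convention: call once with size 0 to learn the required length, then again with a real buffer. A self-contained userspace sketch (the path is hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/gfs2/file";	/* hypothetical file */
	ssize_t len = listxattr(path, NULL, 0);	/* probe required size */
	char *buf, *p;

	if (len <= 0)
		return 1;
	buf = malloc(len);
	if (!buf)
		return 1;
	len = listxattr(path, buf, len);
	if (len < 0)
		return 1;
	/* the result is a run of NUL-terminated names, e.g. "user.foo\0" */
	for (p = buf; p < buf + len; p += strlen(p) + 1)
		printf("%s\n", p);
	free(buf);
	return 0;
}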
@@ -491,84 +517,61 @@ out:
491} 517}
492 518
493int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, 519int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
494 char *data) 520 char *data, size_t size)
495{ 521{
522 int ret;
523 size_t len = GFS2_EA_DATA_LEN(el->el_ea);
524 if (len > size)
525 return -ERANGE;
526
496 if (GFS2_EA_IS_STUFFED(el->el_ea)) { 527 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
497 memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); 528 memcpy(data, GFS2_EA2DATA(el->el_ea), len);
498 return 0; 529 return len;
499 } else 530 }
500 return ea_get_unstuffed(ip, el->el_ea, data); 531 ret = ea_get_unstuffed(ip, el->el_ea, data);
532 if (ret < 0)
533 return ret;
534 return len;
501} 535}
502 536
503/** 537/**
504 * gfs2_ea_get_i - 538 * gfs2_xattr_get - Get a GFS2 extended attribute
505 * @ip: The GFS2 inode 539 * @inode: The inode
506 * @er: The request structure 540 * @type: The type of extended attribute
541 * @name: The name of the extended attribute
542 * @buffer: The buffer to write the result into
543 * @size: The size of the buffer
507 * 544 *
508 * Returns: actual size of data on success, -errno on error 545 * Returns: actual size of data on success, -errno on error
509 */ 546 */
510 547
511int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) 548int gfs2_xattr_get(struct inode *inode, int type, const char *name,
549 void *buffer, size_t size)
512{ 550{
551 struct gfs2_inode *ip = GFS2_I(inode);
513 struct gfs2_ea_location el; 552 struct gfs2_ea_location el;
514 int error; 553 int error;
515 554
516 if (!ip->i_eattr) 555 if (!ip->i_eattr)
517 return -ENODATA; 556 return -ENODATA;
557 if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
558 return -EINVAL;
518 559
519 error = gfs2_ea_find(ip, er, &el); 560 error = gfs2_ea_find(ip, type, name, &el);
520 if (error) 561 if (error)
521 return error; 562 return error;
522 if (!el.el_ea) 563 if (!el.el_ea)
523 return -ENODATA; 564 return -ENODATA;
524 565 if (size)
525 if (er->er_data_len) { 566 error = gfs2_ea_get_copy(ip, &el, buffer, size);
526 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) 567 else
527 error = -ERANGE;
528 else
529 error = gfs2_ea_get_copy(ip, &el, er->er_data);
530 }
531 if (!error)
532 error = GFS2_EA_DATA_LEN(el.el_ea); 568 error = GFS2_EA_DATA_LEN(el.el_ea);
533
534 brelse(el.el_bh); 569 brelse(el.el_bh);
535 570
536 return error; 571 return error;
537} 572}
538 573
539/** 574/**
540 * gfs2_ea_get -
541 * @ip: The GFS2 inode
542 * @er: The request structure
543 *
544 * Returns: actual size of data on success, -errno on error
545 */
546
547int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
548{
549 struct gfs2_holder i_gh;
550 int error;
551
552 if (!er->er_name_len ||
553 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
554 return -EINVAL;
555 if (!er->er_data || !er->er_data_len) {
556 er->er_data = NULL;
557 er->er_data_len = 0;
558 }
559
560 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
561 if (error)
562 return error;
563
564 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
565
566 gfs2_glock_dq_uninit(&i_gh);
567
568 return error;
569}
570
571/**
572 * ea_alloc_blk - allocates a new block for extended attributes. 575 * ea_alloc_blk - allocates a new block for extended attributes.
573 * @ip: A pointer to the inode that's getting extended attributes 576 * @ip: A pointer to the inode that's getting extended attributes
574 * @bhp: Pointer to pointer to a struct buffer_head 577 * @bhp: Pointer to pointer to a struct buffer_head
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
713 716
714 error = gfs2_meta_inode_buffer(ip, &dibh); 717 error = gfs2_meta_inode_buffer(ip, &dibh);
715 if (!error) { 718 if (!error) {
716 if (er->er_flags & GFS2_ERF_MODE) {
717 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
718 (ip->i_inode.i_mode & S_IFMT) ==
719 (er->er_mode & S_IFMT));
720 ip->i_inode.i_mode = er->er_mode;
721 }
722 ip->i_inode.i_ctime = CURRENT_TIME; 719 ip->i_inode.i_ctime = CURRENT_TIME;
723 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 720 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
724 gfs2_dinode_out(ip, dibh->b_data); 721 gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
762 * Returns: errno 759 * Returns: errno
763 */ 760 */
764 761
765static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) 762static int ea_init(struct gfs2_inode *ip, int type, const char *name,
763 const void *data, size_t size)
766{ 764{
765 struct gfs2_ea_request er;
767 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; 766 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
768 unsigned int blks = 1; 767 unsigned int blks = 1;
769 768
770 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) 769 er.er_type = type;
771 blks += DIV_ROUND_UP(er->er_data_len, jbsize); 770 er.er_name = name;
771 er.er_name_len = strlen(name);
772 er.er_data = (void *)data;
773 er.er_data_len = size;
774
775 if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
776 blks += DIV_ROUND_UP(er.er_data_len, jbsize);
772 777
773 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); 778 return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
774} 779}
775 780
776static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) 781static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
848 error = gfs2_meta_inode_buffer(ip, &dibh); 853 error = gfs2_meta_inode_buffer(ip, &dibh);
849 if (error) 854 if (error)
850 goto out; 855 goto out;
851
852 if (er->er_flags & GFS2_ERF_MODE) {
853 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
854 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
855 ip->i_inode.i_mode = er->er_mode;
856 }
857 ip->i_inode.i_ctime = CURRENT_TIME; 856 ip->i_inode.i_ctime = CURRENT_TIME;
858 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 857 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
859 gfs2_dinode_out(ip, dibh->b_data); 858 gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
894 int stuffed; 893 int stuffed;
895 int error; 894 int error;
896 895
897 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); 896 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
897 es->es_er->er_data_len, &size);
898 898
899 if (ea->ea_type == GFS2_EATYPE_UNUSED) { 899 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
900 if (GFS2_EA_REC_LEN(ea) < size) 900 if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
1005 return error; 1005 return error;
1006} 1006}
1007 1007
1008static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, 1008static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
1009 struct gfs2_ea_location *el) 1009 const void *value, size_t size, struct gfs2_ea_location *el)
1010{ 1010{
1011 struct gfs2_ea_request er;
1011 struct ea_set es; 1012 struct ea_set es;
1012 unsigned int blks = 2; 1013 unsigned int blks = 2;
1013 int error; 1014 int error;
1014 1015
1016 er.er_type = type;
1017 er.er_name = name;
1018 er.er_data = (void *)value;
1019 er.er_name_len = strlen(name);
1020 er.er_data_len = size;
1021
1015 memset(&es, 0, sizeof(struct ea_set)); 1022 memset(&es, 0, sizeof(struct ea_set));
1016 es.es_er = er; 1023 es.es_er = &er;
1017 es.es_el = el; 1024 es.es_el = el;
1018 1025
1019 error = ea_foreach(ip, ea_set_simple, &es); 1026 error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1024 1031
1025 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) 1032 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1026 blks++; 1033 blks++;
1027 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1034 if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1028 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1035 blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1029 1036
1030 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); 1037 return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
1031} 1038}
1032 1039
1033static int ea_set_remove_unstuffed(struct gfs2_inode *ip, 1040static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1039 GFS2_EA2NEXT(el->el_prev) == el->el_ea); 1046 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1040 } 1047 }
1041 1048
1042 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); 1049 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1043}
1044
1045int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1046{
1047 struct gfs2_ea_location el;
1048 int error;
1049
1050 if (!ip->i_eattr) {
1051 if (er->er_flags & XATTR_REPLACE)
1052 return -ENODATA;
1053 return ea_init(ip, er);
1054 }
1055
1056 error = gfs2_ea_find(ip, er, &el);
1057 if (error)
1058 return error;
1059
1060 if (el.el_ea) {
1061 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1062 brelse(el.el_bh);
1063 return -EPERM;
1064 }
1065
1066 error = -EEXIST;
1067 if (!(er->er_flags & XATTR_CREATE)) {
1068 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1069 error = ea_set_i(ip, er, &el);
1070 if (!error && unstuffed)
1071 ea_set_remove_unstuffed(ip, &el);
1072 }
1073
1074 brelse(el.el_bh);
1075 } else {
1076 error = -ENODATA;
1077 if (!(er->er_flags & XATTR_REPLACE))
1078 error = ea_set_i(ip, er, NULL);
1079 }
1080
1081 return error;
1082}
1083
1084int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1085{
1086 struct gfs2_holder i_gh;
1087 int error;
1088
1089 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1090 return -EINVAL;
1091 if (!er->er_data || !er->er_data_len) {
1092 er->er_data = NULL;
1093 er->er_data_len = 0;
1094 }
1095 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1096 if (error)
1097 return error;
1098
1099 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1100 if (error)
1101 return error;
1102
1103 if (IS_IMMUTABLE(&ip->i_inode))
1104 error = -EPERM;
1105 else
1106 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1107
1108 gfs2_glock_dq_uninit(&i_gh);
1109
1110 return error;
1111} 1050}
1112 1051
1113static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) 1052static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1131 1070
1132 if (GFS2_EA_IS_LAST(ea)) 1071 if (GFS2_EA_IS_LAST(ea))
1133 prev->ea_flags |= GFS2_EAFLAG_LAST; 1072 prev->ea_flags |= GFS2_EAFLAG_LAST;
1134 } else 1073 } else {
1135 ea->ea_type = GFS2_EATYPE_UNUSED; 1074 ea->ea_type = GFS2_EATYPE_UNUSED;
1075 }
1136 1076
1137 error = gfs2_meta_inode_buffer(ip, &dibh); 1077 error = gfs2_meta_inode_buffer(ip, &dibh);
1138 if (!error) { 1078 if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1147 return error; 1087 return error;
1148} 1088}
1149 1089
1150int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1090/**
1091 * gfs2_xattr_remove - Remove a GFS2 extended attribute
1092 * @inode: The inode
1093 * @type: The type of the extended attribute
1094 * @name: The name of the extended attribute
1095 *
1096 * This is not called directly by the VFS since we use the (common)
1097 * scheme of making a "set with NULL data" mean a remove request. Note
1098 * that this is different from a set with zero length data.
1099 *
1100 * Returns: 0, or errno on failure
1101 */
1102
1103static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
1151{ 1104{
1105 struct gfs2_inode *ip = GFS2_I(inode);
1152 struct gfs2_ea_location el; 1106 struct gfs2_ea_location el;
1153 int error; 1107 int error;
1154 1108
1155 if (!ip->i_eattr) 1109 if (!ip->i_eattr)
1156 return -ENODATA; 1110 return -ENODATA;
1157 1111
1158 error = gfs2_ea_find(ip, er, &el); 1112 error = gfs2_ea_find(ip, type, name, &el);
1159 if (error) 1113 if (error)
1160 return error; 1114 return error;
1161 if (!el.el_ea) 1115 if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1164 if (GFS2_EA_IS_STUFFED(el.el_ea)) 1118 if (GFS2_EA_IS_STUFFED(el.el_ea))
1165 error = ea_remove_stuffed(ip, &el); 1119 error = ea_remove_stuffed(ip, &el);
1166 else 1120 else
1167 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 1121 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
1168 0);
1169 1122
1170 brelse(el.el_bh); 1123 brelse(el.el_bh);
1171 1124
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1173} 1126}
1174 1127
1175/** 1128/**
1176 * gfs2_ea_remove - sets (or creates or replaces) an extended attribute 1129 * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
1177 * @ip: pointer to the inode of the target file 1130 * @inode: The inode
1178 * @er: request information 1131 * @type: The type of the extended attribute
1132 * @name: The name of the extended attribute
1133 * @value: The value of the extended attribute (NULL for remove)
1134 * @size: The size of the @value argument
1135 * @flags: Create or Replace
1179 * 1136 *
1180 * Returns: errno 1137 * See gfs2_xattr_remove() for details of the removal of xattrs.
1138 *
1139 * Returns: 0 or errno on failure
1181 */ 1140 */
1182 1141
1183int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1142int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1143 const void *value, size_t size, int flags)
1184{ 1144{
1185 struct gfs2_holder i_gh; 1145 struct gfs2_sbd *sdp = GFS2_SB(inode);
1146 struct gfs2_inode *ip = GFS2_I(inode);
1147 struct gfs2_ea_location el;
1148 unsigned int namel = strlen(name);
1186 int error; 1149 int error;
1187 1150
1188 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) 1151 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1189 return -EINVAL; 1152 return -EPERM;
1153 if (namel > GFS2_EA_MAX_NAME_LEN)
1154 return -ERANGE;
1190 1155
1191 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 1156 if (value == NULL)
1157 return gfs2_xattr_remove(inode, type, name);
1158
1159 if (ea_check_size(sdp, namel, size))
1160 return -ERANGE;
1161
1162 if (!ip->i_eattr) {
1163 if (flags & XATTR_REPLACE)
1164 return -ENODATA;
1165 return ea_init(ip, type, name, value, size);
1166 }
1167
1168 error = gfs2_ea_find(ip, type, name, &el);
1192 if (error) 1169 if (error)
1193 return error; 1170 return error;
1194 1171
1195 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) 1172 if (el.el_ea) {
1196 error = -EPERM; 1173 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1197 else 1174 brelse(el.el_bh);
1198 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); 1175 return -EPERM;
1176 }
1199 1177
1200 gfs2_glock_dq_uninit(&i_gh); 1178 error = -EEXIST;
1179 if (!(flags & XATTR_CREATE)) {
1180 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1181 error = ea_set_i(ip, type, name, value, size, &el);
1182 if (!error && unstuffed)
1183 ea_set_remove_unstuffed(ip, &el);
1184 }
1185
1186 brelse(el.el_bh);
1187 return error;
1188 }
1189
1190 error = -ENODATA;
1191 if (!(flags & XATTR_REPLACE))
1192 error = ea_set_i(ip, type, name, value, size, NULL);
1201 1193
1202 return error; 1194 return error;
1203} 1195}
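With the NULL-value convention, one entry point now serves both the set and remove paths. The precondition checks at the top of gfs2_xattr_set() condense to (a restatement of the code above, not new logic):

/*
 *   IS_IMMUTABLE(inode) || IS_APPEND(inode)  -> -EPERM
 *   strlen(name) > GFS2_EA_MAX_NAME_LEN      -> -ERANGE
 *   value == NULL                            -> gfs2_xattr_remove()
 *   oversized value (ea_check_size() fails)  -> -ERANGE
 *   no xattrs yet && XATTR_REPLACE           -> -ENODATA
 */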
@@ -1503,3 +1495,64 @@ out_alloc:
1503 return error; 1495 return error;
1504} 1496}
1505 1497
1498static int gfs2_xattr_user_get(struct inode *inode, const char *name,
1499 void *buffer, size_t size)
1500{
1501 return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
1502}
1503
1504static int gfs2_xattr_user_set(struct inode *inode, const char *name,
1505 const void *value, size_t size, int flags)
1506{
1507 return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
1508}
1509
1510static int gfs2_xattr_system_get(struct inode *inode, const char *name,
1511 void *buffer, size_t size)
1512{
1513 return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
1514}
1515
1516static int gfs2_xattr_system_set(struct inode *inode, const char *name,
1517 const void *value, size_t size, int flags)
1518{
1519 return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
1520}
1521
1522static int gfs2_xattr_security_get(struct inode *inode, const char *name,
1523 void *buffer, size_t size)
1524{
1525 return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
1526}
1527
1528static int gfs2_xattr_security_set(struct inode *inode, const char *name,
1529 const void *value, size_t size, int flags)
1530{
1531 return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
1532}
1533
1534static struct xattr_handler gfs2_xattr_user_handler = {
1535 .prefix = XATTR_USER_PREFIX,
1536 .get = gfs2_xattr_user_get,
1537 .set = gfs2_xattr_user_set,
1538};
1539
1540static struct xattr_handler gfs2_xattr_security_handler = {
1541 .prefix = XATTR_SECURITY_PREFIX,
1542 .get = gfs2_xattr_security_get,
1543 .set = gfs2_xattr_security_set,
1544};
1545
1546static struct xattr_handler gfs2_xattr_system_handler = {
1547 .prefix = XATTR_SYSTEM_PREFIX,
1548 .get = gfs2_xattr_system_get,
1549 .set = gfs2_xattr_system_set,
1550};
1551
1552struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler,
1556 NULL,
1557};
1558
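The handler table works because the generic VFS helpers match an incoming attribute name against each handler's .prefix and invoke the matching .get/.set with the prefix stripped. A simplified sketch of that dispatch (illustrative; the real lookup lives in fs/xattr.c):

#include <linux/string.h>
#include <linux/xattr.h>

static struct xattr_handler *example_resolve(struct xattr_handler **handlers,
					     const char **name)
{
	struct xattr_handler **h;

	for (h = handlers; *h; h++) {
		size_t n = strlen((*h)->prefix);

		if (!strncmp(*name, (*h)->prefix, n)) {
			*name += n;	/* handler sees the unprefixed name */
			return *h;
		}
	}
	return NULL;	/* the real code returns -EOPNOTSUPP here */
}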
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d713..cbdfd7743733 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
19#define GFS2_EA_SIZE(ea) \ 19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ 20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ 21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) 22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
23 23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) 24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) 25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
27#define GFS2_EAREQ_SIZE_STUFFED(er) \ 27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) 28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29 29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) 30#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) 31#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36 32
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
43#define GFS2_EA_BH2FIRST(bh) \ 39#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) 40((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45 41
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request { 42struct gfs2_ea_request {
49 const char *er_name; 43 const char *er_name;
50 char *er_data; 44 char *er_data;
51 unsigned int er_name_len; 45 unsigned int er_name_len;
52 unsigned int er_data_len; 46 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */ 47 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56}; 48};
57 49
58struct gfs2_ea_location { 50struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
61 struct gfs2_ea_header *el_prev; 53 struct gfs2_ea_header *el_prev;
62}; 54};
63 55
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 56extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 57 void *buffer, size_t size);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 58extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
67 59 const void *value, size_t size, int flags);
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); 60extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); 61extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74 62
75/* Exported to acl.c */ 63/* Exported to acl.c */
76 64
77int gfs2_ea_find(struct gfs2_inode *ip, 65extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
78 struct gfs2_ea_request *er, 66 struct gfs2_ea_location *el);
79 struct gfs2_ea_location *el); 67extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
80int gfs2_ea_get_copy(struct gfs2_inode *ip, 68 char *data, size_t size);
81 struct gfs2_ea_location *el, 69extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
82 char *data); 70 struct iattr *attr, char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99 71
100#endif /* __EATTR_DOT_H__ */ 72#endif /* __EATTR_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..b2ba83d2c4e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
182 if (sb->s_bdev) { 182 if (sb->s_bdev) {
183 struct backing_dev_info *bdi; 183 struct backing_dev_info *bdi;
184 184
185 bdi = sb->s_bdev->bd_inode_backing_dev_info; 185 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
186 if (!bdi)
187 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
188 mapping->backing_dev_info = bdi; 186 mapping->backing_dev_info = bdi;
189 } 187 }
190 inode->i_private = NULL; 188 inode->i_private = NULL;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..0df600e9162d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -220,7 +220,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
220 .nr_to_write = mapping->nrpages * 2, 220 .nr_to_write = mapping->nrpages * 2,
221 .range_start = 0, 221 .range_start = 0,
222 .range_end = i_size_read(mapping->host), 222 .range_end = i_size_read(mapping->host),
223 .for_writepages = 1,
224 }; 223 };
225 224
226 ret = generic_writepages(mapping, &wbc); 225 ret = generic_writepages(mapping, &wbc);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 867f70504531..de935692d40d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1918,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
1918 if (server->flags & NFS_MOUNT_NOAC) 1918 if (server->flags & NFS_MOUNT_NOAC)
1919 sb->s_flags |= MS_SYNCHRONOUS; 1919 sb->s_flags |= MS_SYNCHRONOUS;
1920 1920
1921 sb->s_bdi = &server->backing_dev_info;
1922
1921 nfs_super_set_maxbytes(sb, server->maxfilesize); 1923 nfs_super_set_maxbytes(sb, server->maxfilesize);
1922} 1924}
1923 1925
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 120acadc6a84..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1490 .nr_to_write = LONG_MAX, 1490 .nr_to_write = LONG_MAX,
1491 .range_start = 0, 1491 .range_start = 0,
1492 .range_end = LLONG_MAX, 1492 .range_end = LLONG_MAX,
1493 .for_writepages = 1,
1494 }; 1493 };
1495 1494
1496 return __nfs_write_mapping(mapping, &wbc, how); 1495 return __nfs_write_mapping(mapping, &wbc, how);
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
 config NILFS2_FS
 	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	select CRC32
 	help
 	  NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
 }
 
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 			       __u64 *ptrp)
 {
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
 	return ret;
 }
 
-/**
- * nilfs_bmap_lookup - find a record
- * @bmap: bmap
- * @key: key
- * @recp: pointer to record
- *
- * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
- * @bmap.
- *
- * Return Value: On success, 0 is returned and the record associated with @key
- * is stored in the place pointed by @recp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - A record associated with @key does not exist.
- */
-int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
-		      unsigned long key,
-		      unsigned long *recp)
-{
-	__u64 ptr;
-	int ret;
-
-	/* XXX: use macro for level 1 */
-	ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
-	if (recp != NULL)
-		*recp = ptr;
-	return ret;
-}
-
 static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
 	__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
@@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 		(entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
 
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-			      union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
-		       sector_t blocknr)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
-	if (likely(!ret))
-		nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
-	return ret;
-}
-
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-			    union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
-		      sector_t blocknr)
-{
-	return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
-}
-
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
-{
-	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
-}
-
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
-	if (ret < 0)
-		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-
-	return ret;
-}
-
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
-}
-
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *oldreq,
-			       union nilfs_bmap_ptr_req *newreq)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
-}
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
 static struct lock_class_key nilfs_bmap_mdt_lock_key;
 
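The kernel-doc added above fully specifies the lookup contract. A caller following it might look like this sketch — not part of the commit, and example_lookup_data_block is a hypothetical name; the level-1 call is exactly what the removed nilfs_bmap_lookup() did and what the new inline wrapper in bmap.h now does:

/* Sketch only: consume the documented return convention of
 * nilfs_bmap_lookup_at_level(). */
static int example_lookup_data_block(struct nilfs_bmap *bmap, __u64 key,
				     __u64 *blocknr)
{
	__u64 ptr;
	int ret;

	ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
	if (ret < 0)
		return ret;	/* -EIO, -ENOMEM or -ENOENT per the kernel-doc */
	*blocknr = ptr;
	return 0;
}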
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 #include "alloc.h"
+#include "dat.h"
 
 #define NILFS_BMAP_INVALID_PTR	0
 
@@ -141,7 +142,6 @@ struct nilfs_bmap {
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
 
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+				    __u64 *ptr)
+{
+	return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
 /*
  * Internal use only
  */
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
-			      union nilfs_bmap_ptr_req *);
 
 static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	if (dat)
+		return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
 	/* ignore target ptr */
 	req->bpr_ptr = bmap->b_last_allocated_ptr++;
 	return 0;
 }
 
 static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_alloc(dat, &req->bpr_req);
 }
 
 static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
-					      union nilfs_bmap_ptr_req *req)
+					      union nilfs_bmap_ptr_req *req,
+					      struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_alloc(dat, &req->bpr_req);
 	else
 		bmap->b_last_allocated_ptr--;
 }
 
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-
 static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+	return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
 }
 
 static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_end(dat, &req->bpr_req,
+				     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
 static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
-					    union nilfs_bmap_ptr_req *req)
+					    union nilfs_bmap_ptr_req *req,
+					    struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_end(dat, &req->bpr_req);
 }
 
-int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
-		       sector_t);
-int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
-
-
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 			      const struct buffer_head *);
 
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *,
-			       union nilfs_bmap_ptr_req *);
-
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
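With the _v wrappers gone, a caller resolves the DAT inode once and threads it through the prepare/commit/abort helpers, passing NULL for bmaps that do not use virtual block numbers; the helpers then branch on dat instead of re-testing NILFS_BMAP_USE_VBN() on every call. A minimal sketch of the new convention — a hypothetical caller, not from the patch, mirroring what nilfs_btree_prepare_insert() and nilfs_btree_commit_insert() do in the btree.c hunks below:

/* Sketch only: the dat-argument calling convention. */
static int example_alloc_ptr(struct nilfs_bmap *bmap,
			     union nilfs_bmap_ptr_req *req)
{
	struct inode *dat = NULL;
	int ret;

	if (NILFS_BMAP_USE_VBN(bmap))
		dat = nilfs_bmap_get_dat(bmap);

	ret = nilfs_bmap_prepare_alloc_ptr(bmap, req, dat);
	if (ret < 0)
		return ret;
	/* work that could still fail would call
	 * nilfs_bmap_abort_alloc_ptr(bmap, req, dat) on error */
	nilfs_bmap_commit_alloc_ptr(bmap, req, dat);
	return 0;
}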
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
 	kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
-static inline struct nilfs_btree_path *
-nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	return (struct nilfs_btree_path *)
-		kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
 }
 
-static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
-					 struct nilfs_btree_path *path)
+static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
 	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
-static void nilfs_btree_init_path(const struct nilfs_btree *btree,
-				  struct nilfs_btree_path *path)
+static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 {
 	int level;
 
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
 	}
 }
 
-static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
-				   struct nilfs_btree_path *path)
+static void nilfs_btree_release_path(struct nilfs_btree_path *path)
 {
 	int level;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
-		if (path[level].bp_bh != NULL) {
-			brelse(path[level].bp_bh);
-			path[level].bp_bh = NULL;
-		}
-		/* sib_bh is released or deleted by prepare or commit
-		 * operations. */
-		path[level].bp_sib_bh = NULL;
-		path[level].bp_index = 0;
-		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_op = NULL;
-	}
+	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		brelse(path[level].bp_bh);
 }
 
 /*
@@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 }
 
 static inline int
-nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
-			   const struct nilfs_btree_node *node)
+nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
 {
 	return node->bn_flags;
 }
 
 static inline void
-nilfs_btree_node_set_flags(struct nilfs_btree *btree,
-			   struct nilfs_btree_node *node,
-			   int flags)
+nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
 {
 	node->bn_flags = flags;
 }
 
-static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
-					const struct nilfs_btree_node *node)
+static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
 {
-	return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
+	return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
 }
 
 static inline int
-nilfs_btree_node_get_level(const struct nilfs_btree *btree,
-			   const struct nilfs_btree_node *node)
+nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
 {
 	return node->bn_level;
 }
 
 static inline void
-nilfs_btree_node_set_level(struct nilfs_btree *btree,
-			   struct nilfs_btree_node *node,
-			   int level)
+nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
 {
 	node->bn_level = level;
 }
 
 static inline int
-nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
 {
 	return le16_to_cpu(node->bn_nchildren);
 }
 
 static inline void
-nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
-			       struct nilfs_btree_node *node,
-			       int nchildren)
+nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
 {
 	node->bn_nchildren = cpu_to_le16(nchildren);
 }
 
-static inline int
-nilfs_btree_node_size(const struct nilfs_btree *btree)
+static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
 {
 	return 1 << btree->bt_bmap.b_inode->i_blkbits;
 }
 
 static inline int
-nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
+			       const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_root(btree, node) ?
+	return nilfs_btree_node_root(node) ?
 		NILFS_BTREE_ROOT_NCHILDREN_MIN :
 		NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
 }
 
 static inline int
-nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
+			       const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_root(btree, node) ?
+	return nilfs_btree_node_root(node) ?
 		NILFS_BTREE_ROOT_NCHILDREN_MAX :
 		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
 }
 
 static inline __le64 *
-nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
-		       const struct nilfs_btree_node *node)
+nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
 {
 	return (__le64 *)((char *)(node + 1) +
-			  (nilfs_btree_node_root(btree, node) ?
+			  (nilfs_btree_node_root(node) ?
 			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
 }
 
 static inline __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
-		       const struct nilfs_btree_node *node)
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
+		       const struct nilfs_btree *btree)
 {
-	return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
-			  nilfs_btree_node_nchildren_max(btree, node));
+	return (__le64 *)(nilfs_btree_node_dkeys(node) +
+			  nilfs_btree_node_nchildren_max(node, btree));
 }
 
 static inline __u64
-nilfs_btree_node_get_key(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node, int index)
+nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
-					index));
+	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
 }
 
 static inline void
-nilfs_btree_node_set_key(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node, int index, __u64 key)
+nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
 {
-	*(nilfs_btree_node_dkeys(btree, node) + index) =
-		nilfs_bmap_key_to_dkey(key);
+	*(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
 }
 
 static inline __u64
 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node,
-			 int index)
+			 const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
+	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
 					index));
 }
 
 static inline void
 nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node,
-			 int index,
-			 __u64 ptr)
+			 struct nilfs_btree_node *node, int index, __u64 ptr)
 {
-	*(nilfs_btree_node_dptrs(btree, node) + index) =
+	*(nilfs_btree_node_dptrs(node, btree) + index) =
 		nilfs_bmap_ptr_to_dptr(ptr);
 }
 
@@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int i;
 
-	nilfs_btree_node_set_flags(btree, node, flags);
-	nilfs_btree_node_set_level(btree, node, level);
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_flags(node, flags);
+	nilfs_btree_node_set_level(node, level);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	for (i = 0; i < nchildren; i++) {
 		dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
 		dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
-	ldkeys = nilfs_btree_node_dkeys(btree, left);
-	ldptrs = nilfs_btree_node_dptrs(btree, left);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
-	rdkeys = nilfs_btree_node_dkeys(btree, right);
-	rdptrs = nilfs_btree_node_dptrs(btree, right);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
 	memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 
 	lnchildren += n;
 	rnchildren -= n;
-	nilfs_btree_node_set_nchildren(btree, left, lnchildren);
-	nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
 }
 
 /* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
-	ldkeys = nilfs_btree_node_dkeys(btree, left);
-	ldptrs = nilfs_btree_node_dptrs(btree, left);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
-	rdkeys = nilfs_btree_node_dkeys(btree, right);
-	rdptrs = nilfs_btree_node_dptrs(btree, right);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
 	memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 
 	lnchildren -= n;
 	rnchildren += n;
-	nilfs_btree_node_set_nchildren(btree, left, lnchildren);
-	nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int nchildren;
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (index < nchildren) {
 		memmove(dkeys + index + 1, dkeys + index,
 			(nchildren - index) * sizeof(*dkeys));
@@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 	dkeys[index] = nilfs_bmap_key_to_dkey(key);
 	dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
 	nchildren++;
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int nchildren;
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	key = nilfs_bmap_dkey_to_key(dkeys[index]);
 	ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (keyp != NULL)
 		*keyp = key;
 	if (ptrp != NULL)
@@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 			(nchildren - index - 1) * sizeof(*dptrs));
 	}
 	nchildren--;
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
-static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
-				   const struct nilfs_btree_node *node,
+static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
 				   __u64 key, int *indexp)
 {
 	__u64 nkey;
@@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
 
 	/* binary search */
 	low = 0;
-	high = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	high = nilfs_btree_node_get_nchildren(node) - 1;
 	index = 0;
 	s = 0;
 	while (low <= high) {
 		index = (low + high) / 2;
-		nkey = nilfs_btree_node_get_key(btree, node, index);
+		nkey = nilfs_btree_node_get_key(node, index);
 		if (nkey == key) {
 			s = 0;
 			goto out;
@@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
 	}
 
 	/* adjust index */
-	if (nilfs_btree_node_get_level(btree, node) >
-	    NILFS_BTREE_LEVEL_NODE_MIN) {
-		if ((s > 0) && (index > 0))
+	if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
+		if (s > 0 && index > 0)
 			index--;
 	} else if (s < 0)
 		index++;
@@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
 }
 
 static inline struct nilfs_btree_node *
-nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
-			     const struct nilfs_btree_path *path,
-			     int level)
+nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
 }
 
 static inline struct nilfs_btree_node *
-nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_path *path,
-			 int level)
+nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
 }
 
 static inline int nilfs_btree_height(const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
-		+ 1;
+	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
 }
 
 static inline struct nilfs_btree_node *
@@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
 {
 	return (level == nilfs_btree_height(btree) - 1) ?
 		nilfs_btree_get_root(btree) :
-		nilfs_btree_get_nonroot_node(btree, path, level);
+		nilfs_btree_get_nonroot_node(path, level);
 }
 
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	int level, index, found, ret;
 
 	node = nilfs_btree_get_root(btree);
-	level = nilfs_btree_node_get_level(btree, node);
-	if ((level < minlevel) ||
-	    (nilfs_btree_node_get_nchildren(btree, node) <= 0))
+	level = nilfs_btree_node_get_level(node);
+	if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
 		return -ENOENT;
 
-	found = nilfs_btree_node_lookup(btree, node, key, &index);
+	found = nilfs_btree_node_lookup(node, key, &index);
 	ptr = nilfs_btree_node_get_ptr(btree, node, index);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
@@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+		node = nilfs_btree_get_nonroot_node(path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(node));
 		if (!found)
-			found = nilfs_btree_node_lookup(btree, node, key,
-							&index);
+			found = nilfs_btree_node_lookup(node, key, &index);
 		else
 			index = 0;
-		if (index < nilfs_btree_node_nchildren_max(btree, node))
+		if (index < nilfs_btree_node_nchildren_max(node, btree))
 			ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		else {
 			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	int index, level, ret;
 
 	node = nilfs_btree_get_root(btree);
-	index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	index = nilfs_btree_node_get_nchildren(node) - 1;
 	if (index < 0)
 		return -ENOENT;
-	level = nilfs_btree_node_get_level(btree, node);
+	level = nilfs_btree_node_get_level(node);
 	ptr = nilfs_btree_node_get_ptr(btree, node, index);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
@@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
-		index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+		node = nilfs_btree_get_nonroot_node(path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(node));
+		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
 	}
 
 	if (keyp != NULL)
-		*keyp = nilfs_btree_node_get_key(btree, node, index);
+		*keyp = nilfs_btree_node_get_key(node, index);
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
@@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	int level = NILFS_BTREE_LEVEL_NODE_MIN;
 	int ret, cnt, index, maxlevel;
 
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 	if (ret < 0)
 		goto out;
@@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	node = nilfs_btree_get_node(btree, path, level);
 	index = path[level].bp_index + 1;
 	for (;;) {
-		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
-			if (nilfs_btree_node_get_key(btree, node, index) !=
+		while (index < nilfs_btree_node_get_nchildren(node)) {
+			if (nilfs_btree_node_get_key(node, index) !=
 			    key + cnt)
 				goto end;
 			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		/* look-up right sibling node */
 		node = nilfs_btree_get_node(btree, path, level + 1);
 		index = path[level + 1].bp_index + 1;
-		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
-		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+		if (index >= nilfs_btree_node_get_nchildren(node) ||
+		    nilfs_btree_node_get_key(node, index) != key + cnt)
 			break;
 		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level + 1].bp_index = index;
@@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
 		if (ret < 0)
 			goto out;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		index = 0;
 		path[level].bp_index = index;
 	}
@@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_release_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 	do {
 		lock_buffer(path[level].bp_bh);
 		nilfs_btree_node_set_key(
-			btree,
-			nilfs_btree_get_nonroot_node(
-				btree, path, level),
+			nilfs_btree_get_nonroot_node(path, level),
 			path[level].bp_index, key);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 
 	/* root */
 	if (level == nilfs_btree_height(btree) - 1) {
-		nilfs_btree_node_set_key(btree,
-					 nilfs_btree_get_root(btree),
+		nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
 					 path[level].bp_index, key);
 	}
 }
@@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-						nilfs_btree_node_get_key(
-							btree, node, 0));
+						nilfs_btree_node_get_key(node,
+									 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 	move = 0;
 
 	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	if (move) {
 		brelse(path[level].bp_bh);
@@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 	move = 0;
 
 	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	if (move) {
 		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		path[level + 1].bp_index++;
 	} else {
 		brelse(path[level].bp_sib_bh);
@@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	move = 0;
 
 	n = (nchildren + 1) / 2;
@@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	newkey = nilfs_btree_node_get_key(btree, right, 0);
+	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
 
 	if (move) {
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
 					path[level].bp_index);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_bh);
@@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	} else {
 		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_sib_bh);
@@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_sib_bh);
 
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_sib_node(btree, path, level);
+	child = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, root);
+	n = nilfs_btree_node_get_nchildren(root);
 
 	nilfs_btree_node_move_right(btree, root, child, n);
-	nilfs_btree_node_set_level(btree, root, level + 1);
+	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-	*keyp = nilfs_btree_node_get_key(btree, child, 0);
+	*keyp = nilfs_btree_node_get_key(child, 0);
 	*ptrp = path[level].bp_newreq.bpr_ptr;
 }
 
@@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
 	int pindex, level, ret;
+	struct inode *dat = NULL;
 
 	stats->bs_nblocks = 0;
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		path[level].bp_newreq.bpr_ptr =
 			nilfs_btree_find_target_v(btree, path, key);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_data;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		if (nilfs_btree_node_get_nchildren(btree, node) <
-		    nilfs_btree_node_nchildren_max(btree, node)) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_node_get_nchildren(node) <
+		    nilfs_btree_node_nchildren_max(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_insert;
 			stats->bs_nblocks++;
 			goto out;
@@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_left;
 				stats->bs_nblocks++;
@@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		/* right sibling */
 		if (pindex <
-		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+		    nilfs_btree_node_get_nchildren(parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_right;
 				stats->bs_nblocks++;
@@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
 		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-						   &path[level].bp_newreq);
+						   &path[level].bp_newreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 		ret = nilfs_btree_get_new_block(btree,
@@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* root */
 	node = nilfs_btree_get_root(btree);
-	if (nilfs_btree_node_get_nchildren(btree, node) <
-	    nilfs_btree_node_nchildren_max(btree, node)) {
+	if (nilfs_btree_node_get_nchildren(node) <
+	    nilfs_btree_node_nchildren_max(node, btree)) {
 		path[level].bp_op = nilfs_btree_do_insert;
 		stats->bs_nblocks++;
 		goto out;
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
 		nilfs_btnode_delete(path[level].bp_sib_bh);
 		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 
 	}
 
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, __u64 key, __u64 ptr)
 {
+	struct inode *dat = NULL;
 	int level;
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		nilfs_btree_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
-					    &path[level - 1].bp_newreq);
+					    &path[level - 1].bp_newreq, dat);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1184 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1142 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1185 1143
1186 out: 1144 out:
1187 nilfs_btree_clear_path(btree, path); 1145 nilfs_btree_release_path(path);
1188 nilfs_btree_free_path(btree, path); 1146 nilfs_btree_free_path(path);
1189 return ret; 1147 return ret;
1190} 1148}
1191 1149
@@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1197 1155
1198 if (level < nilfs_btree_height(btree) - 1) { 1156 if (level < nilfs_btree_height(btree) - 1) {
1199 lock_buffer(path[level].bp_bh); 1157 lock_buffer(path[level].bp_bh);
1200 node = nilfs_btree_get_nonroot_node(btree, path, level); 1158 node = nilfs_btree_get_nonroot_node(path, level);
1201 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1159 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1202 path[level].bp_index); 1160 path[level].bp_index);
1203 if (!buffer_dirty(path[level].bp_bh)) 1161 if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1205 unlock_buffer(path[level].bp_bh); 1163 unlock_buffer(path[level].bp_bh);
1206 if (path[level].bp_index == 0) 1164 if (path[level].bp_index == 0)
1207 nilfs_btree_promote_key(btree, path, level + 1, 1165 nilfs_btree_promote_key(btree, path, level + 1,
1208 nilfs_btree_node_get_key(btree, node, 0)); 1166 nilfs_btree_node_get_key(node, 0));
1209 } else { 1167 } else {
1210 node = nilfs_btree_get_root(btree); 1168 node = nilfs_btree_get_root(btree);
1211 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1169 nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1225 lock_buffer(path[level].bp_bh); 1183 lock_buffer(path[level].bp_bh);
1226 lock_buffer(path[level].bp_sib_bh); 1184 lock_buffer(path[level].bp_sib_bh);
1227 1185
1228 node = nilfs_btree_get_nonroot_node(btree, path, level); 1186 node = nilfs_btree_get_nonroot_node(path, level);
1229 left = nilfs_btree_get_sib_node(btree, path, level); 1187 left = nilfs_btree_get_sib_node(path, level);
1230 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1188 nchildren = nilfs_btree_node_get_nchildren(node);
1231 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 1189 lnchildren = nilfs_btree_node_get_nchildren(left);
1232 1190
1233 n = (nchildren + lnchildren) / 2 - nchildren; 1191 n = (nchildren + lnchildren) / 2 - nchildren;
1234 1192
@@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1243 unlock_buffer(path[level].bp_sib_bh); 1201 unlock_buffer(path[level].bp_sib_bh);
1244 1202
1245 nilfs_btree_promote_key(btree, path, level + 1, 1203 nilfs_btree_promote_key(btree, path, level + 1,
1246 nilfs_btree_node_get_key(btree, node, 0)); 1204 nilfs_btree_node_get_key(node, 0));
1247 1205
1248 brelse(path[level].bp_sib_bh); 1206 brelse(path[level].bp_sib_bh);
1249 path[level].bp_sib_bh = NULL; 1207 path[level].bp_sib_bh = NULL;
@@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1262 lock_buffer(path[level].bp_bh); 1220 lock_buffer(path[level].bp_bh);
1263 lock_buffer(path[level].bp_sib_bh); 1221 lock_buffer(path[level].bp_sib_bh);
1264 1222
1265 node = nilfs_btree_get_nonroot_node(btree, path, level); 1223 node = nilfs_btree_get_nonroot_node(path, level);
1266 right = nilfs_btree_get_sib_node(btree, path, level); 1224 right = nilfs_btree_get_sib_node(path, level);
1267 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1225 nchildren = nilfs_btree_node_get_nchildren(node);
1268 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 1226 rnchildren = nilfs_btree_node_get_nchildren(right);
1269 1227
1270 n = (nchildren + rnchildren) / 2 - nchildren; 1228 n = (nchildren + rnchildren) / 2 - nchildren;
1271 1229
@@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1281 1239
1282 path[level + 1].bp_index++; 1240 path[level + 1].bp_index++;
1283 nilfs_btree_promote_key(btree, path, level + 1, 1241 nilfs_btree_promote_key(btree, path, level + 1,
1284 nilfs_btree_node_get_key(btree, right, 0)); 1242 nilfs_btree_node_get_key(right, 0));
1285 path[level + 1].bp_index--; 1243 path[level + 1].bp_index--;
1286 1244
1287 brelse(path[level].bp_sib_bh); 1245 brelse(path[level].bp_sib_bh);
@@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1300 lock_buffer(path[level].bp_bh); 1258 lock_buffer(path[level].bp_bh);
1301 lock_buffer(path[level].bp_sib_bh); 1259 lock_buffer(path[level].bp_sib_bh);
1302 1260
1303 node = nilfs_btree_get_nonroot_node(btree, path, level); 1261 node = nilfs_btree_get_nonroot_node(path, level);
1304 left = nilfs_btree_get_sib_node(btree, path, level); 1262 left = nilfs_btree_get_sib_node(path, level);
1305 1263
1306 n = nilfs_btree_node_get_nchildren(btree, node); 1264 n = nilfs_btree_node_get_nchildren(node);
1307 1265
1308 nilfs_btree_node_move_left(btree, left, node, n); 1266 nilfs_btree_node_move_left(btree, left, node, n);
1309 1267
@@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1316 nilfs_btnode_delete(path[level].bp_bh); 1274 nilfs_btnode_delete(path[level].bp_bh);
1317 path[level].bp_bh = path[level].bp_sib_bh; 1275 path[level].bp_bh = path[level].bp_sib_bh;
1318 path[level].bp_sib_bh = NULL; 1276 path[level].bp_sib_bh = NULL;
1319 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); 1277 path[level].bp_index += nilfs_btree_node_get_nchildren(left);
1320} 1278}
1321 1279
1322static void nilfs_btree_concat_right(struct nilfs_btree *btree, 1280static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1331 lock_buffer(path[level].bp_bh); 1289 lock_buffer(path[level].bp_bh);
1332 lock_buffer(path[level].bp_sib_bh); 1290 lock_buffer(path[level].bp_sib_bh);
1333 1291
1334 node = nilfs_btree_get_nonroot_node(btree, path, level); 1292 node = nilfs_btree_get_nonroot_node(path, level);
1335 right = nilfs_btree_get_sib_node(btree, path, level); 1293 right = nilfs_btree_get_sib_node(path, level);
1336 1294
1337 n = nilfs_btree_node_get_nchildren(btree, right); 1295 n = nilfs_btree_node_get_nchildren(right);
1338 1296
1339 nilfs_btree_node_move_left(btree, node, right, n); 1297 nilfs_btree_node_move_left(btree, node, right, n);
1340 1298
@@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1360 1318
1361 lock_buffer(path[level].bp_bh); 1319 lock_buffer(path[level].bp_bh);
1362 root = nilfs_btree_get_root(btree); 1320 root = nilfs_btree_get_root(btree);
1363 child = nilfs_btree_get_nonroot_node(btree, path, level); 1321 child = nilfs_btree_get_nonroot_node(path, level);
1364 1322
1365 nilfs_btree_node_delete(btree, root, NULL, NULL, 0); 1323 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1366 nilfs_btree_node_set_level(btree, root, level); 1324 nilfs_btree_node_set_level(root, level);
1367 n = nilfs_btree_node_get_nchildren(btree, child); 1325 n = nilfs_btree_node_get_nchildren(child);
1368 nilfs_btree_node_move_left(btree, root, child, n); 1326 nilfs_btree_node_move_left(btree, root, child, n);
1369 unlock_buffer(path[level].bp_bh); 1327 unlock_buffer(path[level].bp_bh);
1370 1328
@@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1376static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, 1334static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1377 struct nilfs_btree_path *path, 1335 struct nilfs_btree_path *path,
1378 int *levelp, 1336 int *levelp,
1379 struct nilfs_bmap_stats *stats) 1337 struct nilfs_bmap_stats *stats,
1338 struct inode *dat)
1380{ 1339{
1381 struct buffer_head *bh; 1340 struct buffer_head *bh;
1382 struct nilfs_btree_node *node, *parent, *sib; 1341 struct nilfs_btree_node *node, *parent, *sib;
@@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1388 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1347 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1389 level < nilfs_btree_height(btree) - 1; 1348 level < nilfs_btree_height(btree) - 1;
1390 level++) { 1349 level++) {
1391 node = nilfs_btree_get_nonroot_node(btree, path, level); 1350 node = nilfs_btree_get_nonroot_node(path, level);
1392 path[level].bp_oldreq.bpr_ptr = 1351 path[level].bp_oldreq.bpr_ptr =
1393 nilfs_btree_node_get_ptr(btree, node, 1352 nilfs_btree_node_get_ptr(btree, node,
1394 path[level].bp_index); 1353 path[level].bp_index);
1395 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1354 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
1396 &path[level].bp_oldreq); 1355 &path[level].bp_oldreq, dat);
1397 if (ret < 0) 1356 if (ret < 0)
1398 goto err_out_child_node; 1357 goto err_out_child_node;
1399 1358
1400 if (nilfs_btree_node_get_nchildren(btree, node) > 1359 if (nilfs_btree_node_get_nchildren(node) >
1401 nilfs_btree_node_nchildren_min(btree, node)) { 1360 nilfs_btree_node_nchildren_min(node, btree)) {
1402 path[level].bp_op = nilfs_btree_do_delete; 1361 path[level].bp_op = nilfs_btree_do_delete;
1403 stats->bs_nblocks++; 1362 stats->bs_nblocks++;
1404 goto out; 1363 goto out;
@@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1415 if (ret < 0) 1374 if (ret < 0)
1416 goto err_out_curr_node; 1375 goto err_out_curr_node;
1417 sib = (struct nilfs_btree_node *)bh->b_data; 1376 sib = (struct nilfs_btree_node *)bh->b_data;
1418 if (nilfs_btree_node_get_nchildren(btree, sib) > 1377 if (nilfs_btree_node_get_nchildren(sib) >
1419 nilfs_btree_node_nchildren_min(btree, sib)) { 1378 nilfs_btree_node_nchildren_min(sib, btree)) {
1420 path[level].bp_sib_bh = bh; 1379 path[level].bp_sib_bh = bh;
1421 path[level].bp_op = nilfs_btree_borrow_left; 1380 path[level].bp_op = nilfs_btree_borrow_left;
1422 stats->bs_nblocks++; 1381 stats->bs_nblocks++;
@@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1428 /* continue; */ 1387 /* continue; */
1429 } 1388 }
1430 } else if (pindex < 1389 } else if (pindex <
1431 nilfs_btree_node_get_nchildren(btree, parent) - 1) { 1390 nilfs_btree_node_get_nchildren(parent) - 1) {
1432 /* right sibling */ 1391 /* right sibling */
1433 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1392 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1434 pindex + 1); 1393 pindex + 1);
@@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1436 if (ret < 0) 1395 if (ret < 0)
1437 goto err_out_curr_node; 1396 goto err_out_curr_node;
1438 sib = (struct nilfs_btree_node *)bh->b_data; 1397 sib = (struct nilfs_btree_node *)bh->b_data;
1439 if (nilfs_btree_node_get_nchildren(btree, sib) > 1398 if (nilfs_btree_node_get_nchildren(sib) >
1440 nilfs_btree_node_nchildren_min(btree, sib)) { 1399 nilfs_btree_node_nchildren_min(sib, btree)) {
1441 path[level].bp_sib_bh = bh; 1400 path[level].bp_sib_bh = bh;
1442 path[level].bp_op = nilfs_btree_borrow_right; 1401 path[level].bp_op = nilfs_btree_borrow_right;
1443 stats->bs_nblocks++; 1402 stats->bs_nblocks++;
@@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1452 /* no siblings */ 1411 /* no siblings */
1453 /* the only child of the root node */ 1412 /* the only child of the root node */
1454 WARN_ON(level != nilfs_btree_height(btree) - 2); 1413 WARN_ON(level != nilfs_btree_height(btree) - 2);
1455 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= 1414 if (nilfs_btree_node_get_nchildren(node) - 1 <=
1456 NILFS_BTREE_ROOT_NCHILDREN_MAX) { 1415 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1457 path[level].bp_op = nilfs_btree_shrink; 1416 path[level].bp_op = nilfs_btree_shrink;
1458 stats->bs_nblocks += 2; 1417 stats->bs_nblocks += 2;
@@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1471 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1430 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1472 1431
1473 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1432 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
1474 &path[level].bp_oldreq); 1433 &path[level].bp_oldreq, dat);
1475 if (ret < 0) 1434 if (ret < 0)
1476 goto err_out_child_node; 1435 goto err_out_child_node;
1477 1436
@@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1486 1445
1487 /* error */ 1446 /* error */
1488 err_out_curr_node: 1447 err_out_curr_node:
1489 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); 1448 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
1490 err_out_child_node: 1449 err_out_child_node:
1491 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1450 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1492 brelse(path[level].bp_sib_bh); 1451 brelse(path[level].bp_sib_bh);
1493 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1452 nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
1494 &path[level].bp_oldreq); 1453 &path[level].bp_oldreq, dat);
1495 } 1454 }
1496 *levelp = level; 1455 *levelp = level;
1497 stats->bs_nblocks = 0; 1456 stats->bs_nblocks = 0;
@@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1500 1459
1501static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1460static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1502 struct nilfs_btree_path *path, 1461 struct nilfs_btree_path *path,
1503 int maxlevel) 1462 int maxlevel, struct inode *dat)
1504{ 1463{
1505 int level; 1464 int level;
1506 1465
1507 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1466 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1508 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1467 nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
1509 &path[level].bp_oldreq); 1468 &path[level].bp_oldreq, dat);
1510 path[level].bp_op(btree, path, level, NULL, NULL); 1469 path[level].bp_op(btree, path, level, NULL, NULL);
1511 } 1470 }
1512 1471
@@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1520 struct nilfs_btree *btree; 1479 struct nilfs_btree *btree;
1521 struct nilfs_btree_path *path; 1480 struct nilfs_btree_path *path;
1522 struct nilfs_bmap_stats stats; 1481 struct nilfs_bmap_stats stats;
1482 struct inode *dat;
1523 int level, ret; 1483 int level, ret;
1524 1484
1525 btree = (struct nilfs_btree *)bmap; 1485 btree = (struct nilfs_btree *)bmap;
1526 path = nilfs_btree_alloc_path(btree); 1486 path = nilfs_btree_alloc_path();
1527 if (path == NULL) 1487 if (path == NULL)
1528 return -ENOMEM; 1488 return -ENOMEM;
1529 nilfs_btree_init_path(btree, path); 1489 nilfs_btree_init_path(path);
1530 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1490 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1531 NILFS_BTREE_LEVEL_NODE_MIN); 1491 NILFS_BTREE_LEVEL_NODE_MIN);
1532 if (ret < 0) 1492 if (ret < 0)
1533 goto out; 1493 goto out;
1534 1494
1535 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); 1495
1496 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
1497 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1498
1499 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1536 if (ret < 0) 1500 if (ret < 0)
1537 goto out; 1501 goto out;
1538 nilfs_btree_commit_delete(btree, path, level); 1502 nilfs_btree_commit_delete(btree, path, level, dat);
1539 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1503 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1540 1504
1541out: 1505out:
1542 nilfs_btree_clear_path(btree, path); 1506 nilfs_btree_release_path(path);
1543 nilfs_btree_free_path(btree, path); 1507 nilfs_btree_free_path(path);
1544 return ret; 1508 return ret;
1545} 1509}
1546 1510
@@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1551 int ret; 1515 int ret;
1552 1516
1553 btree = (struct nilfs_btree *)bmap; 1517 btree = (struct nilfs_btree *)bmap;
1554 path = nilfs_btree_alloc_path(btree); 1518 path = nilfs_btree_alloc_path();
1555 if (path == NULL) 1519 if (path == NULL)
1556 return -ENOMEM; 1520 return -ENOMEM;
1557 nilfs_btree_init_path(btree, path); 1521 nilfs_btree_init_path(path);
1558 1522
1559 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1523 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1560 1524
1561 nilfs_btree_clear_path(btree, path); 1525 nilfs_btree_release_path(path);
1562 nilfs_btree_free_path(btree, path); 1526 nilfs_btree_free_path(path);
1563 1527
1564 return ret; 1528 return ret;
1565} 1529}
@@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1581 node = root; 1545 node = root;
1582 break; 1546 break;
1583 case 3: 1547 case 3:
1584 nchildren = nilfs_btree_node_get_nchildren(btree, root); 1548 nchildren = nilfs_btree_node_get_nchildren(root);
1585 if (nchildren > 1) 1549 if (nchildren > 1)
1586 return 0; 1550 return 0;
1587 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1551 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1594 return 0; 1558 return 0;
1595 } 1559 }
1596 1560
1597 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1561 nchildren = nilfs_btree_node_get_nchildren(node);
1598 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); 1562 maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
1599 nextmaxkey = (nchildren > 1) ? 1563 nextmaxkey = (nchildren > 1) ?
1600 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; 1564 nilfs_btree_node_get_key(node, nchildren - 2) : 0;
1601 if (bh != NULL) 1565 if (bh != NULL)
1602 brelse(bh); 1566 brelse(bh);
1603 1567
@@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1623 node = root; 1587 node = root;
1624 break; 1588 break;
1625 case 3: 1589 case 3:
1626 nchildren = nilfs_btree_node_get_nchildren(btree, root); 1590 nchildren = nilfs_btree_node_get_nchildren(root);
1627 WARN_ON(nchildren > 1); 1591 WARN_ON(nchildren > 1);
1628 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1592 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1629 ret = nilfs_btree_get_block(btree, ptr, &bh); 1593 ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1636 return -EINVAL; 1600 return -EINVAL;
1637 } 1601 }
1638 1602
1639 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1603 nchildren = nilfs_btree_node_get_nchildren(node);
1640 if (nchildren < nitems) 1604 if (nchildren < nitems)
1641 nitems = nchildren; 1605 nitems = nchildren;
1642 dkeys = nilfs_btree_node_dkeys(btree, node); 1606 dkeys = nilfs_btree_node_dkeys(node);
1643 dptrs = nilfs_btree_node_dptrs(btree, node); 1607 dptrs = nilfs_btree_node_dptrs(node, btree);
1644 for (i = 0; i < nitems; i++) { 1608 for (i = 0; i < nitems; i++) {
1645 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1609 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1646 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1610 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1660 struct nilfs_bmap_stats *stats) 1624 struct nilfs_bmap_stats *stats)
1661{ 1625{
1662 struct buffer_head *bh; 1626 struct buffer_head *bh;
1663 struct nilfs_btree *btree; 1627 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1628 struct inode *dat = NULL;
1664 int ret; 1629 int ret;
1665 1630
1666 btree = (struct nilfs_btree *)bmap;
1667 stats->bs_nblocks = 0; 1631 stats->bs_nblocks = 0;
1668 1632
1669 /* for data */ 1633 /* for data */
1670 /* cannot find near ptr */ 1634 /* cannot find near ptr */
1671 if (NILFS_BMAP_USE_VBN(bmap)) 1635 if (NILFS_BMAP_USE_VBN(bmap)) {
1672 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1636 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1637 dat = nilfs_bmap_get_dat(bmap);
1638 }
1673 1639
1674 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); 1640 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
1675 if (ret < 0) 1641 if (ret < 0)
1676 return ret; 1642 return ret;
1677 1643
@@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1679 stats->bs_nblocks++; 1645 stats->bs_nblocks++;
1680 if (nreq != NULL) { 1646 if (nreq != NULL) {
1681 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1647 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1682 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); 1648 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
1683 if (ret < 0) 1649 if (ret < 0)
1684 goto err_out_dreq; 1650 goto err_out_dreq;
1685 1651
@@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1696 1662
1697 /* error */ 1663 /* error */
1698 err_out_nreq: 1664 err_out_nreq:
1699 nilfs_bmap_abort_alloc_ptr(bmap, nreq); 1665 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
1700 err_out_dreq: 1666 err_out_dreq:
1701 nilfs_bmap_abort_alloc_ptr(bmap, dreq); 1667 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
1702 stats->bs_nblocks = 0; 1668 stats->bs_nblocks = 0;
1703 return ret; 1669 return ret;
1704 1670
@@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1713 union nilfs_bmap_ptr_req *nreq, 1679 union nilfs_bmap_ptr_req *nreq,
1714 struct buffer_head *bh) 1680 struct buffer_head *bh)
1715{ 1681{
1716 struct nilfs_btree *btree; 1682 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1717 struct nilfs_btree_node *node; 1683 struct nilfs_btree_node *node;
1684 struct inode *dat;
1718 __u64 tmpptr; 1685 __u64 tmpptr;
1719 1686
1720 /* free resources */ 1687 /* free resources */
@@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1725 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1692 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1726 1693
1727 /* convert and insert */ 1694 /* convert and insert */
1728 btree = (struct nilfs_btree *)bmap; 1695 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
1729 nilfs_btree_init(bmap); 1696 nilfs_btree_init(bmap);
1730 if (nreq != NULL) { 1697 if (nreq != NULL) {
1731 nilfs_bmap_commit_alloc_ptr(bmap, dreq); 1698 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
1732 nilfs_bmap_commit_alloc_ptr(bmap, nreq); 1699 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
1733 1700
1734 /* create child node at level 1 */ 1701 /* create child node at level 1 */
1735 lock_buffer(bh); 1702 lock_buffer(bh);
@@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1751 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1718 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1752 2, 1, &keys[0], &tmpptr); 1719 2, 1, &keys[0], &tmpptr);
1753 } else { 1720 } else {
1754 nilfs_bmap_commit_alloc_ptr(bmap, dreq); 1721 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
1755 1722
1756 /* create root node at level 1 */ 1723 /* create root node at level 1 */
1757 node = nilfs_btree_get_root(btree); 1724 node = nilfs_btree_get_root(btree);
@@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1822 1789
1823static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1790static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1824 struct nilfs_btree_path *path, 1791 struct nilfs_btree_path *path,
1825 int level) 1792 int level, struct inode *dat)
1826{ 1793{
1827 struct nilfs_btree_node *parent; 1794 struct nilfs_btree_node *parent;
1828 int ret; 1795 int ret;
@@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1832 nilfs_btree_node_get_ptr(btree, parent, 1799 nilfs_btree_node_get_ptr(btree, parent,
1833 path[level + 1].bp_index); 1800 path[level + 1].bp_index);
1834 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1801 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1835 ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, 1802 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1836 &path[level].bp_oldreq, 1803 &path[level].bp_newreq.bpr_req);
1837 &path[level].bp_newreq);
1838 if (ret < 0) 1804 if (ret < 0)
1839 return ret; 1805 return ret;
1840 1806
@@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1846 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1812 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1847 &path[level].bp_ctxt); 1813 &path[level].bp_ctxt);
1848 if (ret < 0) { 1814 if (ret < 0) {
1849 nilfs_bmap_abort_update_v(&btree->bt_bmap, 1815 nilfs_dat_abort_update(dat,
1850 &path[level].bp_oldreq, 1816 &path[level].bp_oldreq.bpr_req,
1851 &path[level].bp_newreq); 1817 &path[level].bp_newreq.bpr_req);
1852 return ret; 1818 return ret;
1853 } 1819 }
1854 } 1820 }
@@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1858 1824
1859static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1825static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1860 struct nilfs_btree_path *path, 1826 struct nilfs_btree_path *path,
1861 int level) 1827 int level, struct inode *dat)
1862{ 1828{
1863 struct nilfs_btree_node *parent; 1829 struct nilfs_btree_node *parent;
1864 1830
1865 nilfs_bmap_commit_update_v(&btree->bt_bmap, 1831 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1866 &path[level].bp_oldreq, 1832 &path[level].bp_newreq.bpr_req,
1867 &path[level].bp_newreq); 1833 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
1868 1834
1869 if (buffer_nilfs_node(path[level].bp_bh)) { 1835 if (buffer_nilfs_node(path[level].bp_bh)) {
1870 nilfs_btnode_commit_change_key( 1836 nilfs_btnode_commit_change_key(
@@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1881 1847
1882static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1848static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1883 struct nilfs_btree_path *path, 1849 struct nilfs_btree_path *path,
1884 int level) 1850 int level, struct inode *dat)
1885{ 1851{
1886 nilfs_bmap_abort_update_v(&btree->bt_bmap, 1852 nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
1887 &path[level].bp_oldreq, 1853 &path[level].bp_newreq.bpr_req);
1888 &path[level].bp_newreq);
1889 if (buffer_nilfs_node(path[level].bp_bh)) 1854 if (buffer_nilfs_node(path[level].bp_bh))
1890 nilfs_btnode_abort_change_key( 1855 nilfs_btnode_abort_change_key(
1891 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1856 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1894 1859
1895static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1860static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1896 struct nilfs_btree_path *path, 1861 struct nilfs_btree_path *path,
1897 int minlevel, 1862 int minlevel, int *maxlevelp,
1898 int *maxlevelp) 1863 struct inode *dat)
1899{ 1864{
1900 int level, ret; 1865 int level, ret;
1901 1866
1902 level = minlevel; 1867 level = minlevel;
1903 if (!buffer_nilfs_volatile(path[level].bp_bh)) { 1868 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1904 ret = nilfs_btree_prepare_update_v(btree, path, level); 1869 ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1905 if (ret < 0) 1870 if (ret < 0)
1906 return ret; 1871 return ret;
1907 } 1872 }
@@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1909 !buffer_dirty(path[level].bp_bh)) { 1874 !buffer_dirty(path[level].bp_bh)) {
1910 1875
1911 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); 1876 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1912 ret = nilfs_btree_prepare_update_v(btree, path, level); 1877 ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1913 if (ret < 0) 1878 if (ret < 0)
1914 goto out; 1879 goto out;
1915 } 1880 }
@@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1921 /* error */ 1886 /* error */
1922 out: 1887 out:
1923 while (--level > minlevel) 1888 while (--level > minlevel)
1924 nilfs_btree_abort_update_v(btree, path, level); 1889 nilfs_btree_abort_update_v(btree, path, level, dat);
1925 if (!buffer_nilfs_volatile(path[level].bp_bh)) 1890 if (!buffer_nilfs_volatile(path[level].bp_bh))
1926 nilfs_btree_abort_update_v(btree, path, level); 1891 nilfs_btree_abort_update_v(btree, path, level, dat);
1927 return ret; 1892 return ret;
1928} 1893}
1929 1894
1930static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1895static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1931 struct nilfs_btree_path *path, 1896 struct nilfs_btree_path *path,
1932 int minlevel, 1897 int minlevel, int maxlevel,
1933 int maxlevel, 1898 struct buffer_head *bh,
1934 struct buffer_head *bh) 1899 struct inode *dat)
1935{ 1900{
1936 int level; 1901 int level;
1937 1902
1938 if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) 1903 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1939 nilfs_btree_commit_update_v(btree, path, minlevel); 1904 nilfs_btree_commit_update_v(btree, path, minlevel, dat);
1940 1905
1941 for (level = minlevel + 1; level <= maxlevel; level++) 1906 for (level = minlevel + 1; level <= maxlevel; level++)
1942 nilfs_btree_commit_update_v(btree, path, level); 1907 nilfs_btree_commit_update_v(btree, path, level, dat);
1943} 1908}
1944 1909
1945static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1910static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1946 struct nilfs_btree_path *path, 1911 struct nilfs_btree_path *path,
1947 int level, 1912 int level, struct buffer_head *bh)
1948 struct buffer_head *bh)
1949{ 1913{
1950 int maxlevel, ret; 1914 int maxlevel, ret;
1951 struct nilfs_btree_node *parent; 1915 struct nilfs_btree_node *parent;
1916 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1952 __u64 ptr; 1917 __u64 ptr;
1953 1918
1954 get_bh(bh); 1919 get_bh(bh);
1955 path[level].bp_bh = bh; 1920 path[level].bp_bh = bh;
1956 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); 1921 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
1922 dat);
1957 if (ret < 0) 1923 if (ret < 0)
1958 goto out; 1924 goto out;
1959 1925
@@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1961 parent = nilfs_btree_get_node(btree, path, level + 1); 1927 parent = nilfs_btree_get_node(btree, path, level + 1);
1962 ptr = nilfs_btree_node_get_ptr(btree, parent, 1928 ptr = nilfs_btree_node_get_ptr(btree, parent,
1963 path[level + 1].bp_index); 1929 path[level + 1].bp_index);
1964 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); 1930 ret = nilfs_dat_mark_dirty(dat, ptr);
1965 if (ret < 0) 1931 if (ret < 0)
1966 goto out; 1932 goto out;
1967 } 1933 }
1968 1934
1969 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); 1935 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
1970 1936
1971 out: 1937 out:
1972 brelse(path[level].bp_bh); 1938 brelse(path[level].bp_bh);
@@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1986 WARN_ON(!buffer_dirty(bh)); 1952 WARN_ON(!buffer_dirty(bh));
1987 1953
1988 btree = (struct nilfs_btree *)bmap; 1954 btree = (struct nilfs_btree *)bmap;
1989 path = nilfs_btree_alloc_path(btree); 1955 path = nilfs_btree_alloc_path();
1990 if (path == NULL) 1956 if (path == NULL)
1991 return -ENOMEM; 1957 return -ENOMEM;
1992 nilfs_btree_init_path(btree, path); 1958 nilfs_btree_init_path(path);
1993 1959
1994 if (buffer_nilfs_node(bh)) { 1960 if (buffer_nilfs_node(bh)) {
1995 node = (struct nilfs_btree_node *)bh->b_data; 1961 node = (struct nilfs_btree_node *)bh->b_data;
1996 key = nilfs_btree_node_get_key(btree, node, 0); 1962 key = nilfs_btree_node_get_key(node, 0);
1997 level = nilfs_btree_node_get_level(btree, node); 1963 level = nilfs_btree_node_get_level(node);
1998 } else { 1964 } else {
1999 key = nilfs_bmap_data_get_key(bmap, bh); 1965 key = nilfs_bmap_data_get_key(bmap, bh);
2000 level = NILFS_BTREE_LEVEL_DATA; 1966 level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
2013 nilfs_btree_propagate_p(btree, path, level, bh); 1979 nilfs_btree_propagate_p(btree, path, level, bh);
2014 1980
2015 out: 1981 out:
2016 nilfs_btree_clear_path(btree, path); 1982 nilfs_btree_release_path(path);
2017 nilfs_btree_free_path(btree, path); 1983 nilfs_btree_free_path(path);
2018 1984
2019 return ret; 1985 return ret;
2020} 1986}
@@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
2022static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1988static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
2023 struct buffer_head *bh) 1989 struct buffer_head *bh)
2024{ 1990{
2025 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); 1991 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
2026} 1992}
2027 1993
2028static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1994static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
2037 2003
2038 get_bh(bh); 2004 get_bh(bh);
2039 node = (struct nilfs_btree_node *)bh->b_data; 2005 node = (struct nilfs_btree_node *)bh->b_data;
2040 key = nilfs_btree_node_get_key(btree, node, 0); 2006 key = nilfs_btree_node_get_key(node, 0);
2041 level = nilfs_btree_node_get_level(btree, node); 2007 level = nilfs_btree_node_get_level(node);
2042 list_for_each(head, &lists[level]) { 2008 list_for_each(head, &lists[level]) {
2043 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2009 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
2044 cnode = (struct nilfs_btree_node *)cbh->b_data; 2010 cnode = (struct nilfs_btree_node *)cbh->b_data;
2045 ckey = nilfs_btree_node_get_key(btree, cnode, 0); 2011 ckey = nilfs_btree_node_get_key(cnode, 0);
2046 if (key < ckey) 2012 if (key < ckey)
2047 break; 2013 break;
2048 } 2014 }
@@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2120 nilfs_btree_node_set_ptr(btree, parent, 2086 nilfs_btree_node_set_ptr(btree, parent,
2121 path[level + 1].bp_index, blocknr); 2087 path[level + 1].bp_index, blocknr);
2122 2088
2123 key = nilfs_btree_node_get_key(btree, parent, 2089 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2124 path[level + 1].bp_index);
2125 /* on-disk format */ 2090 /* on-disk format */
2126 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2091 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2127 binfo->bi_dat.bi_level = level; 2092 binfo->bi_dat.bi_level = level;
@@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2137 union nilfs_binfo *binfo) 2102 union nilfs_binfo *binfo)
2138{ 2103{
2139 struct nilfs_btree_node *parent; 2104 struct nilfs_btree_node *parent;
2105 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
2140 __u64 key; 2106 __u64 key;
2141 __u64 ptr; 2107 __u64 ptr;
2142 union nilfs_bmap_ptr_req req; 2108 union nilfs_bmap_ptr_req req;
@@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2146 ptr = nilfs_btree_node_get_ptr(btree, parent, 2112 ptr = nilfs_btree_node_get_ptr(btree, parent,
2147 path[level + 1].bp_index); 2113 path[level + 1].bp_index);
2148 req.bpr_ptr = ptr; 2114 req.bpr_ptr = ptr;
2149 ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); 2115 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2150 if (unlikely(ret < 0)) 2116 if (ret < 0)
2151 return ret; 2117 return ret;
2118 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
2152 2119
2153 key = nilfs_btree_node_get_key(btree, parent, 2120 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2154 path[level + 1].bp_index);
2155 /* on-disk format */ 2121 /* on-disk format */
2156 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2122 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2157 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2123 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2171 int level, ret; 2137 int level, ret;
2172 2138
2173 btree = (struct nilfs_btree *)bmap; 2139 btree = (struct nilfs_btree *)bmap;
2174 path = nilfs_btree_alloc_path(btree); 2140 path = nilfs_btree_alloc_path();
2175 if (path == NULL) 2141 if (path == NULL)
2176 return -ENOMEM; 2142 return -ENOMEM;
2177 nilfs_btree_init_path(btree, path); 2143 nilfs_btree_init_path(path);
2178 2144
2179 if (buffer_nilfs_node(*bh)) { 2145 if (buffer_nilfs_node(*bh)) {
2180 node = (struct nilfs_btree_node *)(*bh)->b_data; 2146 node = (struct nilfs_btree_node *)(*bh)->b_data;
2181 key = nilfs_btree_node_get_key(btree, node, 0); 2147 key = nilfs_btree_node_get_key(node, 0);
2182 level = nilfs_btree_node_get_level(btree, node); 2148 level = nilfs_btree_node_get_level(node);
2183 } else { 2149 } else {
2184 key = nilfs_bmap_data_get_key(bmap, *bh); 2150 key = nilfs_bmap_data_get_key(bmap, *bh);
2185 level = NILFS_BTREE_LEVEL_DATA; 2151 level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2196 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2162 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2197 2163
2198 out: 2164 out:
2199 nilfs_btree_clear_path(btree, path); 2165 nilfs_btree_release_path(path);
2200 nilfs_btree_free_path(btree, path); 2166 nilfs_btree_free_path(path);
2201 2167
2202 return ret; 2168 return ret;
2203} 2169}
@@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2207 sector_t blocknr, 2173 sector_t blocknr,
2208 union nilfs_binfo *binfo) 2174 union nilfs_binfo *binfo)
2209{ 2175{
2210 struct nilfs_btree *btree;
2211 struct nilfs_btree_node *node; 2176 struct nilfs_btree_node *node;
2212 __u64 key; 2177 __u64 key;
2213 int ret; 2178 int ret;
2214 2179
2215 btree = (struct nilfs_btree *)bmap; 2180 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
2216 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); 2181 blocknr);
2217 if (ret < 0) 2182 if (ret < 0)
2218 return ret; 2183 return ret;
2219 2184
2220 if (buffer_nilfs_node(*bh)) { 2185 if (buffer_nilfs_node(*bh)) {
2221 node = (struct nilfs_btree_node *)(*bh)->b_data; 2186 node = (struct nilfs_btree_node *)(*bh)->b_data;
2222 key = nilfs_btree_node_get_key(btree, node, 0); 2187 key = nilfs_btree_node_get_key(node, 0);
2223 } else 2188 } else
2224 key = nilfs_bmap_data_get_key(bmap, *bh); 2189 key = nilfs_bmap_data_get_key(bmap, *bh);
2225 2190
@@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2239 int ret; 2204 int ret;
2240 2205
2241 btree = (struct nilfs_btree *)bmap; 2206 btree = (struct nilfs_btree *)bmap;
2242 path = nilfs_btree_alloc_path(btree); 2207 path = nilfs_btree_alloc_path();
2243 if (path == NULL) 2208 if (path == NULL)
2244 return -ENOMEM; 2209 return -ENOMEM;
2245 nilfs_btree_init_path(btree, path); 2210 nilfs_btree_init_path(path);
2246 2211
2247 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2212 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2248 if (ret < 0) { 2213 if (ret < 0) {
@@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2262 nilfs_bmap_set_dirty(&btree->bt_bmap); 2227 nilfs_bmap_set_dirty(&btree->bt_bmap);
2263 2228
2264 out: 2229 out:
2265 nilfs_btree_clear_path(btree, path); 2230 nilfs_btree_release_path(path);
2266 nilfs_btree_free_path(btree, path); 2231 nilfs_btree_free_path(path);
2267 return ret; 2232 return ret;
2268} 2233}
2269 2234
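
Note on the btree.c hunks above: they apply one refactoring throughout. The DAT inode is now resolved once per operation (and only when the bmap uses virtual block numbers), then passed into the prepare/commit/abort helpers, replacing the old wrappers that looked it up on every call. A minimal sketch of the resulting calling convention; the helper signatures are those in the diff, but the surrounding function and do_more_work() are invented for illustration:

    /* Illustrative only: how a btree operation threads the DAT inode
     * through a prepare/commit/abort triple after this change. */
    static int example_btree_op(struct nilfs_btree *btree,
                                struct nilfs_btree_path *path, int level)
    {
        struct inode *dat = NULL;
        int ret;

        /* Fetch the DAT inode once; stays NULL for physical blocks. */
        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
            dat = nilfs_bmap_get_dat(&btree->bt_bmap);

        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
                                           &path[level].bp_newreq, dat);
        if (ret < 0)
            return ret;                    /* nothing prepared yet */

        ret = do_more_work();              /* invented stand-in */
        if (ret < 0) {
            nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
                                       &path[level].bp_newreq, dat);
            return ret;
        }

        nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
                                    &path[level].bp_newreq, dat);
        return 0;
    }
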
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
815 void *kaddr; 815 void *kaddr;
816 int ret; 816 int ret;
817 817
818 if (cno == 0) 818 /* CP number is invalid if it's zero or larger than the
819 return -ENOENT; /* checkpoint number 0 is invalid */ 819 largest existing one. */
820 if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
821 return -ENOENT;
820 down_read(&NILFS_MDT(cpfile)->mi_sem); 822 down_read(&NILFS_MDT(cpfile)->mi_sem);
821 823
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); 824 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
824 goto out; 826 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0); 827 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); 828 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp); 829 if (nilfs_checkpoint_invalid(cp))
830 ret = -ENOENT;
831 else
832 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0); 833 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh); 834 brelse(bh);
830 835
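
Note on the cpfile.c hunks above: nilfs_cpfile_is_snapshot() now rejects a checkpoint number up front if it is zero or not below nilfs_mdt_cno(cpfile), and it also returns -ENOENT when the on-disk checkpoint entry is flagged invalid, rather than reading the snapshot bit of a dead entry. Callers should therefore treat -ENOENT as "no such checkpoint". A usage sketch; the wrapper function is invented:

    static int example_check_snapshot(struct inode *cpfile, __u64 cno)
    {
        int ret = nilfs_cpfile_is_snapshot(cpfile, cno);

        if (ret == -ENOENT)
            return 0;       /* cno == 0, cno beyond the newest one,
                             * or the entry is invalid on disk */
        if (ret < 0)
            return ret;     /* a real failure such as -EIO */
        return ret;         /* 1: snapshot, 0: ordinary checkpoint */
    }
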
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29 29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32 30
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, 31int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **, 32 struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
109 nilfs_palloc_commit_free_entry(dat, req); 109 nilfs_palloc_commit_free_entry(dat, req);
110} 110}
111 111
112void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
113{
114 nilfs_dat_abort_entry(dat, req);
115 nilfs_palloc_abort_free_entry(dat, req);
116}
117
118int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) 112int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
119{ 113{
120 int ret; 114 int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
140 nilfs_dat_commit_entry(dat, req); 134 nilfs_dat_commit_entry(dat, req);
141} 135}
142 136
143void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
144{
145 nilfs_dat_abort_entry(dat, req);
146}
147
148int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) 137int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
149{ 138{
150 struct nilfs_dat_entry *entry; 139 struct nilfs_dat_entry *entry;
@@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
222 nilfs_dat_abort_entry(dat, req); 211 nilfs_dat_abort_entry(dat, req);
223} 212}
224 213
214int nilfs_dat_prepare_update(struct inode *dat,
215 struct nilfs_palloc_req *oldreq,
216 struct nilfs_palloc_req *newreq)
217{
218 int ret;
219
220 ret = nilfs_dat_prepare_end(dat, oldreq);
221 if (!ret) {
222 ret = nilfs_dat_prepare_alloc(dat, newreq);
223 if (ret < 0)
224 nilfs_dat_abort_end(dat, oldreq);
225 }
226 return ret;
227}
228
229void nilfs_dat_commit_update(struct inode *dat,
230 struct nilfs_palloc_req *oldreq,
231 struct nilfs_palloc_req *newreq, int dead)
232{
233 nilfs_dat_commit_end(dat, oldreq, dead);
234 nilfs_dat_commit_alloc(dat, newreq);
235}
236
237void nilfs_dat_abort_update(struct inode *dat,
238 struct nilfs_palloc_req *oldreq,
239 struct nilfs_palloc_req *newreq)
240{
241 nilfs_dat_abort_end(dat, oldreq);
242 nilfs_dat_abort_alloc(dat, newreq);
243}
244
225/** 245/**
226 * nilfs_dat_mark_dirty - 246 * nilfs_dat_mark_dirty -
227 * @dat: DAT file inode 247 * @dat: DAT file inode
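
Note on the dat.c hunks above: the three new functions bundle the existing end/alloc primitives into one two-phase update of a virtual block number, so a failure can never leave the DAT half-updated; nilfs_dat_prepare_update() itself unwinds the prepared end when the allocation fails. A minimal usage sketch with invented variable names; as in the diff's callers, pr_entry_nr of the new request is seeded as an allocation hint:

    struct nilfs_palloc_req oldreq, newreq;
    int ret;

    oldreq.pr_entry_nr = vblocknr;      /* virtual block being replaced */
    newreq.pr_entry_nr = vblocknr + 1;  /* hint only; btree.c uses old + 1 */

    ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
    if (ret < 0)
        return ret;                     /* nothing left half-prepared */

    if (change_the_block() < 0) {       /* invented stand-in */
        nilfs_dat_abort_update(dat, &oldreq, &newreq);
        return -EIO;
    }
    /* 'dead' tells commit_end() whether the old entry dies immediately;
     * the btree code passes b_ptr_type == NILFS_BMAP_PTR_VS here. */
    nilfs_dat_commit_update(dat, &oldreq, &newreq, dead);
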
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31 30
32struct nilfs_palloc_req; 31struct nilfs_palloc_req;
33 32
@@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); 38int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, 39void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t); 40 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); 41int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); 42void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); 43void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
44int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
45 struct nilfs_palloc_req *);
46void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
47 struct nilfs_palloc_req *, int);
48void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
49 struct nilfs_palloc_req *);
46 50
47int nilfs_dat_mark_dirty(struct inode *, __u64); 51int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t); 52int nilfs_dat_freev(struct inode *, __u64 *, size_t);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
125 direct->d_bmap.b_last_allocated_ptr = ptr; 125 direct->d_bmap.b_last_allocated_ptr = ptr;
126} 126}
127 127
128static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
129 __u64 key,
130 union nilfs_bmap_ptr_req *req,
131 struct nilfs_bmap_stats *stats)
132{
133 int ret;
134
135 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
136 req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
137 ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
138 if (ret < 0)
139 return ret;
140
141 stats->bs_nblocks = 1;
142 return 0;
143}
144
145static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
146 union nilfs_bmap_ptr_req *req,
147 __u64 key, __u64 ptr)
148{
149 struct buffer_head *bh;
150
151 /* ptr must be a pointer to a buffer head. */
152 bh = (struct buffer_head *)((unsigned long)ptr);
153 set_buffer_nilfs_volatile(bh);
154
155 nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
156 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
157
158 if (!nilfs_bmap_dirty(&direct->d_bmap))
159 nilfs_bmap_set_dirty(&direct->d_bmap);
160
161 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
162 nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
163}
164
165static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 128static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
166{ 129{
167 struct nilfs_direct *direct; 130 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
168 union nilfs_bmap_ptr_req req; 131 union nilfs_bmap_ptr_req req;
169 struct nilfs_bmap_stats stats; 132 struct inode *dat = NULL;
133 struct buffer_head *bh;
170 int ret; 134 int ret;
171 135
172 direct = (struct nilfs_direct *)bmap;
173 if (key > NILFS_DIRECT_KEY_MAX) 136 if (key > NILFS_DIRECT_KEY_MAX)
174 return -ENOENT; 137 return -ENOENT;
175 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) 138 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
176 return -EEXIST; 139 return -EEXIST;
177 140
178 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); 141 if (NILFS_BMAP_USE_VBN(bmap)) {
179 if (ret < 0) 142 req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
180 return ret; 143 dat = nilfs_bmap_get_dat(bmap);
181 nilfs_direct_commit_insert(direct, &req, key, ptr); 144 }
182 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 145 ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
146 if (!ret) {
147 /* ptr must be a pointer to a buffer head. */
148 bh = (struct buffer_head *)((unsigned long)ptr);
149 set_buffer_nilfs_volatile(bh);
183 150
184 return 0; 151 nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
185} 152 nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
186 153
187static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, 154 if (!nilfs_bmap_dirty(bmap))
188 union nilfs_bmap_ptr_req *req, 155 nilfs_bmap_set_dirty(bmap);
189 __u64 key,
190 struct nilfs_bmap_stats *stats)
191{
192 int ret;
193 156
194 req->bpr_ptr = nilfs_direct_get_ptr(direct, key); 157 if (NILFS_BMAP_USE_VBN(bmap))
195 ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); 158 nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
196 if (!ret)
197 stats->bs_nblocks = 1;
198 return ret;
199}
200 159
201static void nilfs_direct_commit_delete(struct nilfs_direct *direct, 160 nilfs_bmap_add_blocks(bmap, 1);
202 union nilfs_bmap_ptr_req *req, 161 }
203 __u64 key) 162 return ret;
204{
205 nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
206 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
207} 163}
208 164
209static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) 165static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
210{ 166{
211 struct nilfs_direct *direct; 167 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
212 union nilfs_bmap_ptr_req req; 168 union nilfs_bmap_ptr_req req;
213 struct nilfs_bmap_stats stats; 169 struct inode *dat;
214 int ret; 170 int ret;
215 171
216 direct = (struct nilfs_direct *)bmap; 172 if (key > NILFS_DIRECT_KEY_MAX ||
217 if ((key > NILFS_DIRECT_KEY_MAX) ||
218 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) 173 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
219 return -ENOENT; 174 return -ENOENT;
220 175
221 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); 176 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
222 if (ret < 0) 177 req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
223 return ret;
224 nilfs_direct_commit_delete(direct, &req, key);
225 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
226 178
227 return 0; 179 ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
180 if (!ret) {
181 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
182 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
183 nilfs_bmap_sub_blocks(bmap, 1);
184 }
185 return ret;
228} 186}
229 187
230static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 188static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
310 return 0; 268 return 0;
311} 269}
312 270
313static int nilfs_direct_propagate_v(struct nilfs_direct *direct, 271static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
314 struct buffer_head *bh) 272 struct buffer_head *bh)
315{ 273{
316 union nilfs_bmap_ptr_req oldreq, newreq; 274 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
275 struct nilfs_palloc_req oldreq, newreq;
276 struct inode *dat;
317 __u64 key; 277 __u64 key;
318 __u64 ptr; 278 __u64 ptr;
319 int ret; 279 int ret;
320 280
321 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); 281 if (!NILFS_BMAP_USE_VBN(bmap))
282 return 0;
283
284 dat = nilfs_bmap_get_dat(bmap);
285 key = nilfs_bmap_data_get_key(bmap, bh);
322 ptr = nilfs_direct_get_ptr(direct, key); 286 ptr = nilfs_direct_get_ptr(direct, key);
323 if (!buffer_nilfs_volatile(bh)) { 287 if (!buffer_nilfs_volatile(bh)) {
324 oldreq.bpr_ptr = ptr; 288 oldreq.pr_entry_nr = ptr;
325 newreq.bpr_ptr = ptr; 289 newreq.pr_entry_nr = ptr;
326 ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, 290 ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
327 &newreq);
328 if (ret < 0) 291 if (ret < 0)
329 return ret; 292 return ret;
330 nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); 293 nilfs_dat_commit_update(dat, &oldreq, &newreq,
294 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
331 set_buffer_nilfs_volatile(bh); 295 set_buffer_nilfs_volatile(bh);
332 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); 296 nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
333 } else 297 } else
334 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); 298 ret = nilfs_dat_mark_dirty(dat, ptr);
335 299
336 return ret; 300 return ret;
337} 301}
338 302
339static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
340 struct buffer_head *bh)
341{
342 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
343
344 return NILFS_BMAP_USE_VBN(bmap) ?
345 nilfs_direct_propagate_v(direct, bh) : 0;
346}
347
348static int nilfs_direct_assign_v(struct nilfs_direct *direct, 303static int nilfs_direct_assign_v(struct nilfs_direct *direct,
349 __u64 key, __u64 ptr, 304 __u64 key, __u64 ptr,
350 struct buffer_head **bh, 305 struct buffer_head **bh,
351 sector_t blocknr, 306 sector_t blocknr,
352 union nilfs_binfo *binfo) 307 union nilfs_binfo *binfo)
353{ 308{
309 struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
354 union nilfs_bmap_ptr_req req; 310 union nilfs_bmap_ptr_req req;
355 int ret; 311 int ret;
356 312
357 req.bpr_ptr = ptr; 313 req.bpr_ptr = ptr;
358 ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); 314 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
359 if (unlikely(ret < 0)) 315 if (!ret) {
360 return ret; 316 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
361 317 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
362 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 318 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
363 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 319 }
364 320 return ret;
365 return 0;
366} 321}
367 322
368static int nilfs_direct_assign_p(struct nilfs_direct *direct, 323static int nilfs_direct_assign_p(struct nilfs_direct *direct,
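
Note on the assign paths in direct.c and btree.c above: both set req.bpr_ptr and then hand &req.bpr_req straight to nilfs_dat_prepare_start(). That idiom relies on the layout of the request union, paraphrased here for reference (fs/nilfs2/alloc.h and bmap.h hold the authoritative definitions): bpr_ptr overlays the first member of the embedded palloc request.

    struct nilfs_palloc_req {
        __u64 pr_entry_nr;               /* entry (virtual block) number */
        struct buffer_head *pr_desc_bh;
        struct buffer_head *pr_bitmap_bh;
        struct buffer_head *pr_entry_bh;
    };

    union nilfs_bmap_ptr_req {
        __u64 bpr_ptr;                   /* aliases bpr_req.pr_entry_nr */
        struct nilfs_palloc_req bpr_req;
    };

    /* Hence the pattern used throughout this patch: */
    req.bpr_ptr = ptr;                   /* also sets bpr_req.pr_entry_nr */
    ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
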
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
31#include "mdt.h" 31#include "mdt.h"
32#include "alloc.h" 32#include "alloc.h"
33 33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35 34
36static inline struct nilfs_inode * 35static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) 36nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..807e584b163d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
430 430
431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
432 432
433 if (nilfs_read_inode_common(inode, raw_inode)) 433 err = nilfs_read_inode_common(inode, raw_inode);
434 if (err)
434 goto failed_unmap; 435 goto failed_unmap;
435 436
436 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
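
Note on the inode.c hunk above: it is an error-propagation fix. The old code tested the helper's return value inline and discarded it, so the failed_unmap path returned whatever err already held from an earlier step, possibly zero. The pitfall in general form; setup() and helper() are invented:

    static int example_read(void)
    {
        int err = setup();      /* may succeed, leaving err == 0 */
        if (err)
            goto failed;

        /* Buggy shape: "if (helper()) goto failed;" discards the errno,
         * and 'failed' then returns the stale err from setup().
         * Fixed shape, as in the hunk above: capture before testing. */
        err = helper();
        if (err)
            goto failed;

        return 0;
    failed:
        return err;
    }
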
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
442 const char *msg; 442 const char *msg;
443 int ret; 443 int ret;
444 444
445 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
446 if (ret < 0) {
447 msg = "cannot read source blocks";
448 goto failed;
449 }
450
451 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); 445 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
452 if (ret < 0) { 446 if (ret < 0) {
453 /* 447 /*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
548 } 542 }
549 } 543 }
550 544
551 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 545 /*
546 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
547 * which operates on an inode list without blocking.
548 * To protect the list from concurrent operations,
549 * nilfs_ioctl_move_blocks() should be an atomic operation.
550 */
551 if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
552 ret = -EBUSY;
553 goto out_free;
554 }
555
556 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
557 if (ret < 0)
558 printk(KERN_ERR "NILFS: GC failed during preparation: "
559 "cannot read source blocks: err=%d\n", ret);
560 else
561 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
562
563 clear_nilfs_gc_running(nilfs);
552 564
553 out_free: 565 out_free:
554 while (--n >= 0) 566 while (--n >= 0)
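
The bit-flag guard added here is the usual lock-free mutual-exclusion idiom:
test_and_set_bit() atomically claims the flag and reports whether it was
already held, so only one GC pass can walk the gc-inode list at a time. A
minimal sketch, with do_gc_preparation() as a hypothetical stand-in for the
move-blocks phase:

        if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags))
                return -EBUSY;                  /* another GC pass owns the list */
        ret = do_gc_preparation(nilfs);         /* hypothetical stand-in */
        clear_nilfs_gc_running(nilfs);          /* helper generated by THE_NILFS_FNS() */
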
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..156bf6091a96 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
103 goto failed_unlock; 103 goto failed_unlock;
104 104
105 err = -EEXIST; 105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh)) 106 if (buffer_uptodate(bh))
107 goto failed_bh; 107 goto failed_bh;
108#if 0 108
109 /* The uptodate flag is not protected by the page lock, but
110 the mapped flag is. Thus, we don't have to wait the buffer. */
111 wait_on_buffer(bh); 109 wait_on_buffer(bh);
112 if (buffer_uptodate(bh)) 110 if (buffer_uptodate(bh))
113 goto failed_bh; 111 goto failed_bh;
114#endif
115 112
116 bh->b_bdev = nilfs->ns_bdev; 113 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh) 136 int mode, struct buffer_head **out_bh)
140{ 137{
141 struct buffer_head *bh; 138 struct buffer_head *bh;
142 unsigned long blknum = 0; 139 __u64 blknum = 0;
143 int ret = -ENOMEM; 140 int ret = -ENOMEM;
144 141
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); 142 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
162 unlock_buffer(bh); 159 unlock_buffer(bh);
163 goto out; 160 goto out;
164 } 161 }
165 if (!buffer_mapped(bh)) { /* unused buffer */ 162
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, 163 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
167 &blknum); 164 if (unlikely(ret)) {
168 if (unlikely(ret)) { 165 unlock_buffer(bh);
169 unlock_buffer(bh); 166 goto failed_bh;
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 } 167 }
168 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
169 bh->b_blocknr = (sector_t)blknum;
170 set_buffer_mapped(bh);
176 171
177 bh->b_end_io = end_buffer_read_sync; 172 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh); 173 get_bh(bh);
@@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
402 struct inode *inode = container_of(page->mapping, 397 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data); 398 struct inode, i_data);
404 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
400 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL; 401 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 402 int err = 0;
407 403
@@ -411,9 +407,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
411 if (page->mapping->assoc_mapping) 407 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */ 408 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) { 409 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); 410 down_read(&nilfs->ns_writer_sem);
411 writer = nilfs->ns_writer;
415 if (!writer) { 412 if (!writer) {
416 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 413 up_read(&nilfs->ns_writer_sem);
417 return -EROFS; 414 return -EROFS;
418 } 415 }
419 sb = writer->s_super; 416 sb = writer->s_super;
@@ -425,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
425 nilfs_flush_segment(sb, inode->i_ino); 422 nilfs_flush_segment(sb, inode->i_ino);
426 423
427 if (writer) 424 if (writer)
428 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 425 up_read(&nilfs->ns_writer_sem);
429 return err; 426 return err;
430} 427}
431 428
@@ -516,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
516} 513}
517 514
518struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
519 ino_t ino, gfp_t gfp_mask) 516 ino_t ino)
520{ 517{
521 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); 518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
519 NILFS_MDT_GFP);
522 520
523 if (!inode) 521 if (!inode)
524 return NULL; 522 return NULL;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *); 75int nilfs_mdt_fetch_dirty(struct inode *);
76 76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, 78struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t); 79 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *); 80void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
552 printk(KERN_WARNING 552 printk(KERN_WARNING
553 "NILFS warning: error recovering data block " 553 "NILFS warning: error recovering data block "
554 "(err=%d, ino=%lu, block-offset=%llu)\n", 554 "(err=%d, ino=%lu, block-offset=%llu)\n",
555 err, rb->ino, (unsigned long long)rb->blkoff); 555 err, (unsigned long)rb->ino,
556 (unsigned long long)rb->blkoff);
556 if (!err2) 557 if (!err2)
557 err2 = err; 558 err2 = err;
558 next: 559 next:
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316{ 316{
317 struct bio *bio; 317 struct bio *bio;
318 318
319 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 319 bio = bio_alloc(GFP_NOIO, nr_vecs);
320 if (bio == NULL) { 320 if (bio == NULL) {
321 while (!bio && (nr_vecs >>= 1)) 321 while (!bio && (nr_vecs >>= 1))
322 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 322 bio = bio_alloc(GFP_NOIO, nr_vecs);
323 } 323 }
324 if (likely(bio)) { 324 if (likely(bio)) {
325 bio->bi_bdev = sb->s_bdev; 325 bio->bi_bdev = sb->s_bdev;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2502 nilfs_discontinued(nilfs)) { 2502 nilfs_discontinued(nilfs)) {
2503 down_write(&nilfs->ns_sem); 2503 down_write(&nilfs->ns_sem);
2504 req->sb_err = nilfs_commit_super(sbi, 0); 2504 req->sb_err = nilfs_commit_super(sbi,
2505 nilfs_altsb_need_update(nilfs));
2505 up_write(&nilfs->ns_sem); 2506 up_write(&nilfs->ns_sem);
2506 } 2507 }
2507 } 2508 }
@@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
2689 } else { 2690 } else {
2690 DEFINE_WAIT(wait); 2691 DEFINE_WAIT(wait);
2691 int should_sleep = 1; 2692 int should_sleep = 1;
2693 struct the_nilfs *nilfs;
2692 2694
2693 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2695 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2694 TASK_INTERRUPTIBLE); 2696 TASK_INTERRUPTIBLE);
@@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
2709 finish_wait(&sci->sc_wait_daemon, &wait); 2711 finish_wait(&sci->sc_wait_daemon, &wait);
2710 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2712 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2711 time_after_eq(jiffies, sci->sc_timer->expires)); 2713 time_after_eq(jiffies, sci->sc_timer->expires));
2714 nilfs = sci->sc_sbi->s_nilfs;
2715 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
2716 set_nilfs_discontinued(nilfs);
2712 } 2717 }
2713 goto loop; 2718 goto loop;
2714 2719
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29#include "mdt.h" 29#include "mdt.h"
30 30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32 31
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{ 33{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 151964f0de4c..55f3d6b60732 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
50#include <linux/writeback.h> 50#include <linux/writeback.h>
51#include <linux/kobject.h> 51#include <linux/kobject.h>
52#include <linux/exportfs.h> 52#include <linux/exportfs.h>
53#include <linux/seq_file.h>
54#include <linux/mount.h>
53#include "nilfs.h" 55#include "nilfs.h"
54#include "mdt.h" 56#include "mdt.h"
55#include "alloc.h" 57#include "alloc.h"
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 67 "(NILFS)");
66MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
67 69
68static void nilfs_write_super(struct super_block *sb);
69static int nilfs_remount(struct super_block *sb, int *flags, char *data); 70static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70 71
71/** 72/**
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb)
311 312
312 lock_kernel(); 313 lock_kernel();
313 314
314 if (sb->s_dirt)
315 nilfs_write_super(sb);
316
317 nilfs_detach_segment_constructor(sbi); 315 nilfs_detach_segment_constructor(sbi);
318 316
319 if (!(sb->s_flags & MS_RDONLY)) { 317 if (!(sb->s_flags & MS_RDONLY)) {
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb)
336 unlock_kernel(); 334 unlock_kernel();
337} 335}
338 336
339/** 337static int nilfs_sync_fs(struct super_block *sb, int wait)
340 * nilfs_write_super - write super block(s) of NILFS
341 * @sb: super_block
342 *
343 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
344 * clears s_dirt. This function is called in the section protected by
345 * lock_super().
346 *
347 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
348 * of the struct the_nilfs. Lock order must be as follows:
349 *
350 * 1. lock_super()
351 * 2. down_write(&nilfs->ns_sem)
352 *
353 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
354 * of the super block (nilfs->ns_sbp[]).
355 *
356 * In most cases, VFS functions call lock_super() before calling these
357 * methods. So we must be careful not to bring on deadlocks when using
358 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
359 *
360 * Note that order of lock_kernel() and lock_super() depends on contexts
361 * of VFS. We should also note that lock_kernel() can be used in its
362 * protective section and only the outermost one has an effect.
363 */
364static void nilfs_write_super(struct super_block *sb)
365{ 338{
366 struct nilfs_sb_info *sbi = NILFS_SB(sb); 339 struct nilfs_sb_info *sbi = NILFS_SB(sb);
367 struct the_nilfs *nilfs = sbi->s_nilfs; 340 struct the_nilfs *nilfs = sbi->s_nilfs;
368
369 down_write(&nilfs->ns_sem);
370 if (!(sb->s_flags & MS_RDONLY)) {
371 struct nilfs_super_block **sbp = nilfs->ns_sbp;
372 u64 t = get_seconds();
373 int dupsb;
374
375 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
376 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
377 up_write(&nilfs->ns_sem);
378 return;
379 }
380 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
381 nilfs_commit_super(sbi, dupsb);
382 }
383 sb->s_dirt = 0;
384 up_write(&nilfs->ns_sem);
385}
386
387static int nilfs_sync_fs(struct super_block *sb, int wait)
388{
389 int err = 0; 341 int err = 0;
390 342
391 nilfs_write_super(sb);
392
393 /* This function is called when the super block should be written back */ 343 /* This function is called when the super block should be written back */
394 if (wait) 344 if (wait)
395 err = nilfs_construct_segment(sb); 345 err = nilfs_construct_segment(sb);
346
347 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt)
349 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem);
351
396 return err; 352 return err;
397} 353}
398 354
@@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
407 list_add(&sbi->s_list, &nilfs->ns_supers); 363 list_add(&sbi->s_list, &nilfs->ns_supers);
408 up_write(&nilfs->ns_super_sem); 364 up_write(&nilfs->ns_super_sem);
409 365
410 sbi->s_ifile = nilfs_mdt_new( 366 sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
411 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
412 if (!sbi->s_ifile) 367 if (!sbi->s_ifile)
413 return -ENOMEM; 368 return -ENOMEM;
414 369
@@ -529,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
529 return 0; 484 return 0;
530} 485}
531 486
487static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
488{
489 struct super_block *sb = vfs->mnt_sb;
490 struct nilfs_sb_info *sbi = NILFS_SB(sb);
491
492 if (!nilfs_test_opt(sbi, BARRIER))
493 seq_printf(seq, ",barrier=off");
494 if (nilfs_test_opt(sbi, SNAPSHOT))
495 seq_printf(seq, ",cp=%llu",
496 (unsigned long long int)sbi->s_snapshot_cno);
497 if (nilfs_test_opt(sbi, ERRORS_RO))
498 seq_printf(seq, ",errors=remount-ro");
499 if (nilfs_test_opt(sbi, ERRORS_PANIC))
500 seq_printf(seq, ",errors=panic");
501 if (nilfs_test_opt(sbi, STRICT_ORDER))
502 seq_printf(seq, ",order=strict");
503
504 return 0;
505}
506
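
With ->show_options wired up, a read-only snapshot mount would be reported in
/proc/mounts roughly as follows (device and checkpoint number illustrative):

        /dev/sdb1 /mnt nilfs2 ro,cp=5 0 0
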
532static struct super_operations nilfs_sops = { 507static struct super_operations nilfs_sops = {
533 .alloc_inode = nilfs_alloc_inode, 508 .alloc_inode = nilfs_alloc_inode,
534 .destroy_inode = nilfs_destroy_inode, 509 .destroy_inode = nilfs_destroy_inode,
@@ -538,7 +513,7 @@ static struct super_operations nilfs_sops = {
538 /* .drop_inode = nilfs_drop_inode, */ 513 /* .drop_inode = nilfs_drop_inode, */
539 .delete_inode = nilfs_delete_inode, 514 .delete_inode = nilfs_delete_inode,
540 .put_super = nilfs_put_super, 515 .put_super = nilfs_put_super,
541 .write_super = nilfs_write_super, 516 /* .write_super = nilfs_write_super, */
542 .sync_fs = nilfs_sync_fs, 517 .sync_fs = nilfs_sync_fs,
543 /* .write_super_lockfs */ 518 /* .write_super_lockfs */
544 /* .unlockfs */ 519 /* .unlockfs */
@@ -546,7 +521,7 @@ static struct super_operations nilfs_sops = {
546 .remount_fs = nilfs_remount, 521 .remount_fs = nilfs_remount,
547 .clear_inode = nilfs_clear_inode, 522 .clear_inode = nilfs_clear_inode,
548 /* .umount_begin */ 523 /* .umount_begin */
549 /* .show_options */ 524 .show_options = nilfs_show_options
550}; 525};
551 526
552static struct inode * 527static struct inode *
@@ -816,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
816 791
817 if (sb->s_flags & MS_RDONLY) { 792 if (sb->s_flags & MS_RDONLY) {
818 if (nilfs_test_opt(sbi, SNAPSHOT)) { 793 if (nilfs_test_opt(sbi, SNAPSHOT)) {
794 down_read(&nilfs->ns_segctor_sem);
819 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, 795 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
820 sbi->s_snapshot_cno); 796 sbi->s_snapshot_cno);
821 if (err < 0) 797 up_read(&nilfs->ns_segctor_sem);
798 if (err < 0) {
799 if (err == -ENOENT)
800 err = -EINVAL;
822 goto failed_sbi; 801 goto failed_sbi;
802 }
823 if (!err) { 803 if (!err) {
824 printk(KERN_ERR 804 printk(KERN_ERR
825 "NILFS: The specified checkpoint is " 805 "NILFS: The specified checkpoint is "
@@ -1127,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1127 */ 1107 */
1128 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); 1108 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1129 1109
1130 if (!sd.cno)
1131 /* trying to get the latest checkpoint. */
1132 sd.cno = nilfs_last_cno(nilfs);
1133
1134 /* 1110 /*
1135 * Get super block instance holding the nilfs_sb_info struct. 1111 * Get super block instance holding the nilfs_sb_info struct.
1136 * A new instance is allocated if no existing mount is present or 1112 * A new instance is allocated if no existing mount is present or
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 68
69 nilfs->ns_bdev = bdev; 69 nilfs->ns_bdev = bdev;
70 atomic_set(&nilfs->ns_count, 1); 70 atomic_set(&nilfs->ns_count, 1);
71 atomic_set(&nilfs->ns_writer_refcount, -1);
72 atomic_set(&nilfs->ns_ndirtyblks, 0); 71 atomic_set(&nilfs->ns_ndirtyblks, 0);
73 init_rwsem(&nilfs->ns_sem); 72 init_rwsem(&nilfs->ns_sem);
74 init_rwsem(&nilfs->ns_super_sem); 73 init_rwsem(&nilfs->ns_super_sem);
75 mutex_init(&nilfs->ns_mount_mutex); 74 mutex_init(&nilfs->ns_mount_mutex);
76 mutex_init(&nilfs->ns_writer_mutex); 75 init_rwsem(&nilfs->ns_writer_sem);
77 INIT_LIST_HEAD(&nilfs->ns_list); 76 INIT_LIST_HEAD(&nilfs->ns_list);
78 INIT_LIST_HEAD(&nilfs->ns_supers); 77 INIT_LIST_HEAD(&nilfs->ns_supers);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 78 spin_lock_init(&nilfs->ns_last_segment_lock);
@@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
188 inode_size = nilfs->ns_inode_size; 187 inode_size = nilfs->ns_inode_size;
189 188
190 err = -ENOMEM; 189 err = -ENOMEM;
191 nilfs->ns_dat = nilfs_mdt_new( 190 nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
192 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
193 if (unlikely(!nilfs->ns_dat)) 191 if (unlikely(!nilfs->ns_dat))
194 goto failed; 192 goto failed;
195 193
196 nilfs->ns_gc_dat = nilfs_mdt_new( 194 nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
197 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
198 if (unlikely(!nilfs->ns_gc_dat)) 195 if (unlikely(!nilfs->ns_gc_dat))
199 goto failed_dat; 196 goto failed_dat;
200 197
201 nilfs->ns_cpfile = nilfs_mdt_new( 198 nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
202 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
203 if (unlikely(!nilfs->ns_cpfile)) 199 if (unlikely(!nilfs->ns_cpfile))
204 goto failed_gc_dat; 200 goto failed_gc_dat;
205 201
206 nilfs->ns_sufile = nilfs_mdt_new( 202 nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
207 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
208 if (unlikely(!nilfs->ns_sufile)) 203 if (unlikely(!nilfs->ns_sufile))
209 goto failed_cpfile; 204 goto failed_cpfile;
210 205
@@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
596 591
597 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 592 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
598 593
599 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; 594 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
600 if (!bdi)
601 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
602 nilfs->ns_bdi = bdi ? : &default_backing_dev_info; 595 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
603 596
604 /* Finding last segment */ 597 /* Finding last segment */
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1b9caafb8662..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and 37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */
40}; 41};
41 42
42/** 43/**
@@ -50,8 +51,7 @@ enum {
50 * @ns_sem: semaphore for shared states 51 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances 52 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs 53 * @ns_mount_mutex: mutex protecting mount process of nilfs
53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 54 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount 55 * @ns_current: back pointer to current mount
56 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
57 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
@@ -100,8 +100,7 @@ struct the_nilfs {
100 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem; 101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex; 102 struct mutex ns_mount_mutex;
103 struct mutex ns_writer_mutex; 103 struct rw_semaphore ns_writer_sem;
104 atomic_t ns_writer_refcount;
105 104
106 /* 105 /*
107 * components protected by ns_super_sem 106 * components protected by ns_super_sem
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
197THE_NILFS_FNS(INIT, init) 196THE_NILFS_FNS(INIT, init)
198THE_NILFS_FNS(LOADED, loaded) 197THE_NILFS_FNS(LOADED, loaded)
199THE_NILFS_FNS(DISCONTINUED, discontinued) 198THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running)
200 200
201/* Minimum interval of periodical update of superblocks (in seconds) */ 201/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 202#define NILFS_SB_FREQ 10
203#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
204 204
205static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
206{
207 u64 t = get_seconds();
208 return t < nilfs->ns_sbwtime[0] ||
209 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
210}
211
212static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
213{
214 u64 t = get_seconds();
215 struct nilfs_super_block **sbp = nilfs->ns_sbp;
216 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
217}
218
205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 219void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
206struct the_nilfs *find_or_create_nilfs(struct block_device *); 220struct the_nilfs *find_or_create_nilfs(struct block_device *);
207void put_nilfs(struct the_nilfs *); 221void put_nilfs(struct the_nilfs *);
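
These two helpers centralize the superblock-age checks that the removed
nilfs_write_super() open-coded against NILFS_SB_FREQ and NILFS_ALTSB_FREQ. An
illustrative caller, in the spirit of the segment.c hunk above (which guards
on the discontinued flag instead):

        down_write(&nilfs->ns_sem);
        if (nilfs_sb_need_update(nilfs))
                nilfs_commit_super(sbi, nilfs_altsb_need_update(nilfs));
        up_write(&nilfs->ns_sem);
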
@@ -221,34 +235,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
221 atomic_inc(&nilfs->ns_count); 235 atomic_inc(&nilfs->ns_count);
222} 236}
223 237
224static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
225{
226 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
227 mutex_lock(&nilfs->ns_writer_mutex);
228 return nilfs->ns_writer;
229}
230
231static inline void nilfs_put_writer(struct the_nilfs *nilfs)
232{
233 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
234 mutex_unlock(&nilfs->ns_writer_mutex);
235}
236
237static inline void 238static inline void
238nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 239nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
239{ 240{
240 mutex_lock(&nilfs->ns_writer_mutex); 241 down_write(&nilfs->ns_writer_sem);
241 nilfs->ns_writer = sbi; 242 nilfs->ns_writer = sbi;
242 mutex_unlock(&nilfs->ns_writer_mutex); 243 up_write(&nilfs->ns_writer_sem);
243} 244}
244 245
245static inline void 246static inline void
246nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 247nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
247{ 248{
248 mutex_lock(&nilfs->ns_writer_mutex); 249 down_write(&nilfs->ns_writer_sem);
249 if (sbi == nilfs->ns_writer) 250 if (sbi == nilfs->ns_writer)
250 nilfs->ns_writer = NULL; 251 nilfs->ns_writer = NULL;
251 mutex_unlock(&nilfs->ns_writer_mutex); 252 up_write(&nilfs->ns_writer_sem);
252} 253}
253 254
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) 255static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
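
The refcounted mutex scheme (nilfs_get_writer()/nilfs_put_writer()) is gone;
attach/detach now take ns_writer_sem for writing, and transient readers of
ns_writer, such as nilfs_mdt_write_page() above, take it shared. A hedged
reader-side sketch:

        down_read(&nilfs->ns_writer_sem);
        writer = nilfs->ns_writer;
        if (writer)
                sb = writer->s_super;   /* safe: detach is excluded while we hold the sem */
        up_read(&nilfs->ns_writer_sem);
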
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..4350d4993b18 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
2076 *ppos = pos; 2076 *ppos = pos;
2077 if (cached_page) 2077 if (cached_page)
2078 page_cache_release(cached_page); 2078 page_cache_release(cached_page);
2079 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2080 if (likely(!status)) {
2081 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2082 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2083 status = generic_osync_inode(vi, mapping,
2084 OSYNC_METADATA|OSYNC_DATA);
2085 }
2086 }
2087 pagevec_lru_add_file(&lru_pvec); 2079 pagevec_lru_add_file(&lru_pvec);
2088 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2080 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2089 written ? "written" : "status", (unsigned long)written, 2081 written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2145 mutex_lock(&inode->i_mutex); 2137 mutex_lock(&inode->i_mutex);
2146 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2138 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2147 mutex_unlock(&inode->i_mutex); 2139 mutex_unlock(&inode->i_mutex);
2148 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2140 if (ret > 0) {
2149 int err = sync_page_range(inode, mapping, pos, ret); 2141 int err = generic_write_sync(file, pos, ret);
2150 if (err < 0) 2142 if (err < 0)
2151 ret = err; 2143 ret = err;
2152 } 2144 }
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2173 if (ret == -EIOCBQUEUED) 2165 if (ret == -EIOCBQUEUED)
2174 ret = wait_on_sync_kiocb(&kiocb); 2166 ret = wait_on_sync_kiocb(&kiocb);
2175 mutex_unlock(&inode->i_mutex); 2167 mutex_unlock(&inode->i_mutex);
2176 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2168 if (ret > 0) {
2177 int err = sync_page_range(inode, mapping, *ppos - ret, ret); 2169 int err = generic_write_sync(file, *ppos - ret, ret);
2178 if (err < 0) 2170 if (err < 0)
2179 ret = err; 2171 ret = err;
2180 } 2172 }
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
384 * it is dirty in the inode meta data rather than the data page cache of the 384 * it is dirty in the inode meta data rather than the data page cache of the
385 * inode, and thus there are no data pages that need writing out. Therefore, a 385 * inode, and thus there are no data pages that need writing out. Therefore, a
386 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the 386 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
387 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to 387 * other hand, is not sufficient, because ->write_inode needs to be called even
388 * ensure ->write_inode is called from generic_osync_inode() and this needs to 388 * in case of fdatasync. This needs to happen or the file data would not
389 * happen or the file data would not necessarily hit the device synchronously, 389 * necessarily hit the device synchronously, even though the vfs inode has the
390 * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC 390 * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
391 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not 391 * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
392 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own 392 * which is not what I_DIRTY_SYNC on its own would suggest.
393 * would suggest.
394 */ 393 */
395void __mark_mft_record_dirty(ntfs_inode *ni) 394void __mark_mft_record_dirty(ntfs_inode *ni)
396{ 395{
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index aa501d3f93f1..221c5e98957b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1871,8 +1871,7 @@ relock:
1871 goto out_dio; 1871 goto out_dio;
1872 } 1872 }
1873 } else { 1873 } else {
1874 written = generic_file_aio_write_nolock(iocb, iov, nr_segs, 1874 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
1875 *ppos);
1876 } 1875 }
1877 1876
1878out_dio: 1877out_dio:
@@ -1880,18 +1879,21 @@ out_dio:
1880 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 1879 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1881 1880
1882 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { 1881 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
1883 /* 1882 ret = filemap_fdatawrite_range(file->f_mapping, pos,
1884 * The generic write paths have handled getting data 1883 pos + count - 1);
1885 * to disk, but since we don't make use of the dirty 1884 if (ret < 0)
1886 * inode list, a manual journal commit is necessary 1885 written = ret;
1887 * here. 1886
1888 */ 1887 if (!ret && (old_size != i_size_read(inode) ||
1889 if (old_size != i_size_read(inode) || 1888 old_clusters != OCFS2_I(inode)->ip_clusters)) {
1890 old_clusters != OCFS2_I(inode)->ip_clusters) {
1891 ret = jbd2_journal_force_commit(osb->journal->j_journal); 1889 ret = jbd2_journal_force_commit(osb->journal->j_journal);
1892 if (ret < 0) 1890 if (ret < 0)
1893 written = ret; 1891 written = ret;
1894 } 1892 }
1893
1894 if (!ret)
1895 ret = filemap_fdatawait_range(file->f_mapping, pos,
1896 pos + count - 1);
1895 } 1897 }
1896 1898
1897 /* 1899 /*
@@ -1991,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1991 1993
1992 if (ret > 0) { 1994 if (ret > 0) {
1993 unsigned long nr_pages; 1995 unsigned long nr_pages;
1996 int err;
1994 1997
1995 *ppos += ret;
1996 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1998 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1997 1999
1998 /* 2000 err = generic_write_sync(out, *ppos, ret);
1999 * If file or inode is SYNC and we actually wrote some data, 2001 if (err)
2000 * sync it. 2002 ret = err;
2001 */ 2003 else
2002 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 2004 *ppos += ret;
2003 int err;
2004
2005 mutex_lock(&inode->i_mutex);
2006 err = ocfs2_rw_lock(inode, 1);
2007 if (err < 0) {
2008 mlog_errno(err);
2009 } else {
2010 err = generic_osync_inode(inode, mapping,
2011 OSYNC_METADATA|OSYNC_DATA);
2012 ocfs2_rw_unlock(inode, 1);
2013 }
2014 mutex_unlock(&inode->i_mutex);
2015 2005
2016 if (err)
2017 ret = err;
2018 }
2019 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2006 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2020 } 2007 }
2021 2008
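
The reworked O_SYNC paths above settle on a three-step ordering: start data
writeback, force a journal commit if the write changed inode metadata, then
wait for the data I/O. A hedged sketch of that shape (metadata_changed is an
illustrative condition standing in for the i_size/ip_clusters comparison):

        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
        if (!ret && metadata_changed)
                ret = jbd2_journal_force_commit(journal);
        if (!ret)
                ret = filemap_fdatawait_range(mapping, pos, pos + count - 1);
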
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..fbeaddf595d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
248 part_stat_read(p, merges[WRITE]), 248 part_stat_read(p, merges[WRITE]),
249 (unsigned long long)part_stat_read(p, sectors[WRITE]), 249 (unsigned long long)part_stat_read(p, sectors[WRITE]),
250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), 250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
251 p->in_flight, 251 part_in_flight(p),
252 jiffies_to_msecs(part_stat_read(p, io_ticks)), 252 jiffies_to_msecs(part_stat_read(p, io_ticks)),
253 jiffies_to_msecs(part_stat_read(p, time_in_queue))); 253 jiffies_to_msecs(part_stat_read(p, time_in_queue)));
254} 254}
255 255
256ssize_t part_inflight_show(struct device *dev,
257 struct device_attribute *attr, char *buf)
258{
259 struct hd_struct *p = dev_to_part(dev);
260
261 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
262}
263
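
Userspace can read the split counters directly: e.g. cat
/sys/block/sda/sda1/inflight (path illustrative) prints the number of reads in
flight followed by writes in flight, along the lines of "0 2".
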
256#ifdef CONFIG_FAIL_MAKE_REQUEST 264#ifdef CONFIG_FAIL_MAKE_REQUEST
257ssize_t part_fail_show(struct device *dev, 265ssize_t part_fail_show(struct device *dev,
258 struct device_attribute *attr, char *buf) 266 struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
284#ifdef CONFIG_FAIL_MAKE_REQUEST 293#ifdef CONFIG_FAIL_MAKE_REQUEST
285static struct device_attribute dev_attr_fail = 294static struct device_attribute dev_attr_fail =
286 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); 295 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
292 &dev_attr_size.attr, 301 &dev_attr_size.attr,
293 &dev_attr_alignment_offset.attr, 302 &dev_attr_alignment_offset.attr,
294 &dev_attr_stat.attr, 303 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr,
295#ifdef CONFIG_FAIL_MAKE_REQUEST 305#ifdef CONFIG_FAIL_MAKE_REQUEST
296 &dev_attr_fail.attr, 306 &dev_attr_fail.attr,
297#endif 307#endif
@@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = {
302 .attrs = part_attrs, 312 .attrs = part_attrs,
303}; 313};
304 314
305static struct attribute_group *part_attr_groups[] = { 315static const struct attribute_group *part_attr_groups[] = {
306 &part_attr_group, 316 &part_attr_group,
307#ifdef CONFIG_BLK_DEV_IO_TRACE 317#ifdef CONFIG_BLK_DEV_IO_TRACE
308 &blk_trace_attr_group, 318 &blk_trace_attr_group,
diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..7394e9e17534 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
502 len = left; 502 len = left;
503 503
504 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 504 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
505 if (ret > 0) 505 if (ret > 0) {
506 *ppos += ret; 506 *ppos += ret;
507 file_accessed(in);
508 }
507 509
508 return ret; 510 return ret;
509} 511}
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
963 965
964 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 966 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
965 ret = file_remove_suid(out); 967 ret = file_remove_suid(out);
966 if (!ret) 968 if (!ret) {
969 file_update_time(out);
967 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); 970 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
971 }
968 mutex_unlock(&inode->i_mutex); 972 mutex_unlock(&inode->i_mutex);
969 } while (ret > 0); 973 } while (ret > 0);
970 splice_from_pipe_end(pipe, &sd); 974 splice_from_pipe_end(pipe, &sd);
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
976 980
977 if (ret > 0) { 981 if (ret > 0) {
978 unsigned long nr_pages; 982 unsigned long nr_pages;
983 int err;
979 984
980 *ppos += ret;
981 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 985 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
982 986
983 /* 987 err = generic_write_sync(out, *ppos, ret);
984 * If file or inode is SYNC and we actually wrote some data, 988 if (err)
985 * sync it. 989 ret = err;
986 */ 990 else
987 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 991 *ppos += ret;
988 int err;
989
990 mutex_lock(&inode->i_mutex);
991 err = generic_osync_inode(inode, mapping,
992 OSYNC_METADATA|OSYNC_DATA);
993 mutex_unlock(&inode->i_mutex);
994
995 if (err)
996 ret = err;
997 }
998 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 992 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
999 } 993 }
1000 994
diff --git a/fs/super.c b/fs/super.c
index 9cda337ddae2..b03fea8fbfb6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
707{ 707{
708 s->s_bdev = data; 708 s->s_bdev = data;
709 s->s_dev = s->s_bdev->bd_dev; 709 s->s_dev = s->s_bdev->bd_dev;
710
711 /*
712 * We set the bdi here to the queue backing, file systems can
713 * overwrite this in ->fill_super()
714 */
715 s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
710 return 0; 716 return 0;
711} 717}
712 718
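
A filesystem that manages its own backing_dev_info overrides this default from
->fill_super(); the fs/ubifs/super.c hunk below does exactly that with
sb->s_bdi = &c->bdi. Either way s_bdi ends up set, which is what the
!sb->s_bdi early return added to __sync_filesystem() in fs/sync.c relies on.
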
diff --git a/fs/sync.c b/fs/sync.c
index 103cc7fdd3df..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
27 */ 27 */
28static int __sync_filesystem(struct super_block *sb, int wait) 28static int __sync_filesystem(struct super_block *sb, int wait)
29{ 29{
30 /*
31 * This should be safe, as we require bdi backing to actually
32 * write out data in the first place
33 */
34 if (!sb->s_bdi)
35 return 0;
36
30 /* Avoid doing twice syncing and cache pruning for quota sync */ 37 /* Avoid doing twice syncing and cache pruning for quota sync */
31 if (!wait) { 38 if (!wait) {
32 writeout_quota_sb(sb, -1); 39 writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
101 spin_unlock(&sb_lock); 108 spin_unlock(&sb_lock);
102 109
103 down_read(&sb->s_umount); 110 down_read(&sb->s_umount);
104 if (!(sb->s_flags & MS_RDONLY) && sb->s_root) 111 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
105 __sync_filesystem(sb, wait); 112 __sync_filesystem(sb, wait);
106 up_read(&sb->s_umount); 113 up_read(&sb->s_umount);
107 114
@@ -178,19 +185,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
178} 185}
179 186
180/** 187/**
181 * vfs_fsync - perform a fsync or fdatasync on a file 188 * vfs_fsync_range - helper to sync a range of data & metadata to disk
182 * @file: file to sync 189 * @file: file to sync
183 * @dentry: dentry of @file 190 * @dentry: dentry of @file
184 * @data: only perform a fdatasync operation 191 * @start: offset in bytes of the beginning of the data range to sync
192 * @end: offset in bytes of the end of the data range (inclusive)
193 * @datasync: perform only datasync
185 * 194 *
186 * Write back data and metadata for @file to disk. If @datasync is 195 * Write back data in range @start..@end and metadata for @file to disk. If
187 * set only metadata needed to access modified file data is written. 196 * @datasync is set only metadata needed to access modified file data is
197 * written.
188 * 198 *
189 * In case this function is called from nfsd @file may be %NULL and 199 * In case this function is called from nfsd @file may be %NULL and
190 * only @dentry is set. This can only happen when the filesystem 200 * only @dentry is set. This can only happen when the filesystem
191 * implements the export_operations API. 201 * implements the export_operations API.
192 */ 202 */
193int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 203int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
204 loff_t end, int datasync)
194{ 205{
195 const struct file_operations *fop; 206 const struct file_operations *fop;
196 struct address_space *mapping; 207 struct address_space *mapping;
@@ -214,7 +225,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
214 goto out; 225 goto out;
215 } 226 }
216 227
217 ret = filemap_fdatawrite(mapping); 228 ret = filemap_write_and_wait_range(mapping, start, end);
218 229
219 /* 230 /*
220 * We need to protect against concurrent writers, which could cause 231 * We need to protect against concurrent writers, which could cause
@@ -225,12 +236,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
225 if (!ret) 236 if (!ret)
226 ret = err; 237 ret = err;
227 mutex_unlock(&mapping->host->i_mutex); 238 mutex_unlock(&mapping->host->i_mutex);
228 err = filemap_fdatawait(mapping); 239
229 if (!ret)
230 ret = err;
231out: 240out:
232 return ret; 241 return ret;
233} 242}
243EXPORT_SYMBOL(vfs_fsync_range);
244
245/**
246 * vfs_fsync - perform a fsync or fdatasync on a file
247 * @file: file to sync
248 * @dentry: dentry of @file
249 * @datasync: only perform a fdatasync operation
250 *
251 * Write back data and metadata for @file to disk. If @datasync is
252 * set only metadata needed to access modified file data is written.
253 *
254 * In case this function is called from nfsd @file may be %NULL and
255 * only @dentry is set. This can only happen when the filesystem
256 * implements the export_operations API.
257 */
258int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
259{
260 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
261}
234EXPORT_SYMBOL(vfs_fsync); 262EXPORT_SYMBOL(vfs_fsync);
235 263
236static int do_fsync(unsigned int fd, int datasync) 264static int do_fsync(unsigned int fd, int datasync)
@@ -256,6 +284,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
256 return do_fsync(fd, 1); 284 return do_fsync(fd, 1);
257} 285}
258 286
287/**
288 * generic_write_sync - perform syncing after a write if file / inode is sync
289 * @file: file to which the write happened
290 * @pos: offset where the write started
291 * @count: length of the write
292 *
293 * This is just a simple wrapper around our general syncing function.
294 */
295int generic_write_sync(struct file *file, loff_t pos, loff_t count)
296{
297 if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
298 return 0;
299 return vfs_fsync_range(file, file->f_path.dentry, pos,
300 pos + count - 1, 1);
301}
302EXPORT_SYMBOL(generic_write_sync);
303
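
generic_write_sync() gives every write path the same tail, replacing the
open-coded sync_page_range()/generic_osync_inode() calls removed in the ntfs,
ocfs2 and splice hunks. A hedged sketch of the calling convention:

        ssize_t ret = __generic_file_aio_write(iocb, iov, nr_segs, ppos);

        if (ret > 0) {
                int err = generic_write_sync(file, pos, ret);
                if (err < 0)
                        ret = err;      /* data was written; report the sync failure */
        }
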
259/* 304/*
260 * sys_sync_file_range() permits finely controlled syncing over a segment of 305 * sys_sync_file_range() permits finely controlled syncing over a segment of
261 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 306 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1c8991b0db13..ee1ce68fd98b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,29 +54,15 @@
54 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
55 * 55 *
56 * This function shrinks UBIFS liability by means of writing back some amount 56 * This function shrinks UBIFS liability by means of writing back some amount
57 * of dirty inodes and their pages. Returns the amount of pages which were 57 * of dirty inodes and their pages.
58 * written back. The returned value does not include dirty inodes which were
59 * synchronized.
60 * 58 *
61 * Note, this function synchronizes even VFS inodes which are locked 59 * Note, this function synchronizes even VFS inodes which are locked
62 * (@i_mutex) by the caller of the budgeting function, because write-back does 60 * (@i_mutex) by the caller of the budgeting function, because write-back does
63 * not touch @i_mutex. 61 * not touch @i_mutex.
64 */ 62 */
65static int shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
66{ 64{
67 int nr_written; 65 writeback_inodes_sb(c->vfs_sb);
68
69 nr_written = writeback_inodes_sb(c->vfs_sb);
70 if (!nr_written) {
71 /*
72 * Re-try again but wait on pages/inodes which are being
73 * written-back concurrently (e.g., by pdflush).
74 */
75 nr_written = sync_inodes_sb(c->vfs_sb);
76 }
77
78 dbg_budg("%d pages were written back", nr_written);
79 return nr_written;
80} 66}
81 67
82/** 68/**
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa8f4de..c4af069df1ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1980,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1980 if (err) 1980 if (err)
1981 goto out_bdi; 1981 goto out_bdi;
1982 1982
1983 sb->s_bdi = &c->bdi;
1983 sb->s_fs_info = c; 1984 sb->s_fs_info = c;
1984 sb->s_magic = UBIFS_SUPER_MAGIC; 1985 sb->s_magic = UBIFS_SUPER_MAGIC;
1985 sb->s_blocksize = UBIFS_BLOCK_SIZE; 1986 sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20 20
21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh,
25 int *error)
26{
27 int loffset = *offset;
28 int block;
29 uint8_t *ad;
30 int remainder;
31
32 *error = 0;
33
34 ad = (uint8_t *)(*bh)->b_data + *offset;
35 *offset += ad_size;
36
37 if (!ad) {
38 brelse(*bh);
39 *error = 1;
40 return NULL;
41 }
42
43 if (*offset == dir->i_sb->s_blocksize) {
44 brelse(*bh);
45 block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
46 if (!block)
47 return NULL;
48 *bh = udf_tread(dir->i_sb, block);
49 if (!*bh)
50 return NULL;
51 } else if (*offset > dir->i_sb->s_blocksize) {
52 ad = tmpad;
53
54 remainder = dir->i_sb->s_blocksize - loffset;
55 memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
56
57 brelse(*bh);
58 block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
59 if (!block)
60 return NULL;
61 (*bh) = udf_tread(dir->i_sb, block);
62 if (!*bh)
63 return NULL;
64
65 memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
66 ad_size - remainder);
67 *offset = ad_size - remainder;
68 }
69
70 return ad;
71}
72#endif
73
74struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, 21struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 22 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 23 struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
248 return fi; 195 return fi;
249} 196}
250 197
251#if 0
252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{
254 struct extent_ad *ext;
255 struct fileEntry *fe;
256 uint8_t *ptr;
257
258 if ((!buffer) || (!offset)) {
259 printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
260 return NULL;
261 }
262
263 fe = (struct fileEntry *)buffer;
264
265 if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
266 udf_debug("0x%x != TAG_IDENT_FE\n",
267 le16_to_cpu(fe->descTag.tagIdent));
268 return NULL;
269 }
270
271 ptr = (uint8_t *)(fe->extendedAttr) +
272 le32_to_cpu(fe->lengthExtendedAttr);
273
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset;
276
277 ext = (struct extent_ad *)ptr;
278
279 *offset = *offset + sizeof(struct extent_ad);
280 return ext;
281}
282#endif
283
284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 198struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 199 int inc)
286{ 200{
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
193static int udf_release_file(struct inode *inode, struct file *filp) 193static int udf_release_file(struct inode *inode, struct file *filp)
194{ 194{
195 if (filp->f_mode & FMODE_WRITE) { 195 if (filp->f_mode & FMODE_WRITE) {
196 mutex_lock(&inode->i_mutex);
196 lock_kernel(); 197 lock_kernel();
197 udf_discard_prealloc(inode); 198 udf_discard_prealloc(inode);
198 unlock_kernel(); 199 unlock_kernel();
200 mutex_unlock(&inode->i_mutex);
199 } 201 }
200 return 0; 202 return 0;
201} 203}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
90} 90}
91 91
92/* 92/*
93 * If we are going to release inode from memory, we discard preallocation and 93 * If we are going to release inode from memory, we truncate last inode extent
94 * truncate last inode extent to proper length. We could use drop_inode() but 94 * to proper length. We could use drop_inode() but it's called under inode_lock
95 * it's called under inode_lock and thus we cannot mark inode dirty there. We 95 * and thus we cannot mark inode dirty there. We use clear_inode() but we have
96 * use clear_inode() but we have to make sure to write inode as it's not written 96 * to make sure to write inode as it's not written automatically.
97 * automatically.
98 */ 97 */
99void udf_clear_inode(struct inode *inode) 98void udf_clear_inode(struct inode *inode)
100{ 99{
101 struct udf_inode_info *iinfo; 100 struct udf_inode_info *iinfo;
102 if (!(inode->i_sb->s_flags & MS_RDONLY)) { 101 if (!(inode->i_sb->s_flags & MS_RDONLY)) {
103 lock_kernel(); 102 lock_kernel();
104 /* Discard preallocation for directories, symlinks, etc. */
105 udf_discard_prealloc(inode);
106 udf_truncate_tail_extent(inode); 103 udf_truncate_tail_extent(inode);
107 unlock_kernel(); 104 unlock_kernel();
108 write_inode_now(inode, 0); 105 write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
664 udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); 661 udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
665 662
666#ifdef UDF_PREALLOCATE 663#ifdef UDF_PREALLOCATE
667 /* preallocate blocks */ 664 /* We preallocate blocks only for regular files. It also makes sense
668 udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); 665 * for directories but there's a problem of when to drop the
666 * preallocation. We might use some delayed work for that but I feel
667 * it's overengineering for a filesystem like UDF. */
668 if (S_ISREG(inode->i_mode))
669 udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
669#endif 670#endif
670 671
671 /* merge any continuous blocks in laarr */ 672 /* merge any continuous blocks in laarr */
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
36 ms_info.addr_format = CDROM_LBA; 36 ms_info.addr_format = CDROM_LBA;
37 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); 37 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
38 38
39#define WE_OBEY_THE_WRITTEN_STANDARDS 1
40
41 if (i == 0) { 39 if (i == 0) {
42 udf_debug("XA disk: %s, vol_desc_start=%d\n", 40 udf_debug("XA disk: %s, vol_desc_start=%d\n",
43 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); 41 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
44#if WE_OBEY_THE_WRITTEN_STANDARDS
45 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
46#endif
47 vol_desc_start = ms_info.addr.lba; 43 vol_desc_start = ms_info.addr.lba;
48 } else { 44 } else {
49 udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); 45 udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
943 pc->componentType = 1; 943 pc->componentType = 1;
944 pc->lengthComponentIdent = 0; 944 pc->lengthComponentIdent = 0;
945 pc->componentFileVersionNum = 0; 945 pc->componentFileVersionNum = 0;
946 pc += sizeof(struct pathComponent);
947 elen += sizeof(struct pathComponent); 946 elen += sizeof(struct pathComponent);
948 } 947 }
949 948
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
216 if (ip->i_d.di_size < isize) { 216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 217 ip->i_d.di_size = isize;
218 ip->i_update_core = 1; 218 ip->i_update_core = 1;
219 ip->i_update_size = 1;
220 xfs_mark_inode_dirty_sync(ip); 219 xfs_mark_inode_dirty_sync(ip);
221 } 220 }
222 221
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
172 */ 172 */
173STATIC int 173STATIC int
174xfs_file_fsync( 174xfs_file_fsync(
175 struct file *filp, 175 struct file *file,
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 179 struct inode *inode = dentry->d_inode;
180 return -xfs_fsync(XFS_I(dentry->d_inode)); 180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187
188 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip);
181} 190}
182 191
183STATIC int 192STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 6c32f1d63d8c..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_itable.h" 44#include "xfs_itable.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_acl.h"
47#include "xfs_attr.h" 46#include "xfs_attr.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
49#include "xfs_utils.h" 48#include "xfs_utils.h"
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
812 812
813 /* Handle various SYNC-type writes */ 813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
815 int error2; 816 int error2;
816 817
817 xfs_iunlock(xip, iolock); 818 xfs_iunlock(xip, iolock);
818 if (need_i_mutex) 819 if (need_i_mutex)
819 mutex_unlock(&inode->i_mutex); 820 mutex_unlock(&inode->i_mutex);
820 error2 = sync_page_range(inode, mapping, pos, ret); 821
822 error2 = filemap_write_and_wait_range(mapping, pos, end);
821 if (!error) 823 if (!error)
822 error = error2; 824 error = error2;
823 if (need_i_mutex) 825 if (need_i_mutex)
824 mutex_lock(&inode->i_mutex); 826 mutex_lock(&inode->i_mutex);
825 xfs_ilock(xip, iolock); 827 xfs_ilock(xip, iolock);
826 error2 = xfs_write_sync_logforce(mp, xip); 828
829 error2 = xfs_fsync(xip);
827 if (!error) 830 if (!error)
828 error = error2; 831 error = error2;
829 } 832 }
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
20 20
21DEFINE_PER_CPU(struct xfsstats, xfsstats); 21DEFINE_PER_CPU(struct xfsstats, xfsstats);
22 22
23STATIC int 23static int xfs_stat_proc_show(struct seq_file *m, void *v)
24xfs_read_xfsstats(
25 char *buffer,
26 char **start,
27 off_t offset,
28 int count,
29 int *eof,
30 void *data)
31{ 24{
32 int c, i, j, len, val; 25 int c, i, j, val;
33 __uint64_t xs_xstrat_bytes = 0; 26 __uint64_t xs_xstrat_bytes = 0;
34 __uint64_t xs_write_bytes = 0; 27 __uint64_t xs_write_bytes = 0;
35 __uint64_t xs_read_bytes = 0; 28 __uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
60 }; 53 };
61 54
62 /* Loop over all stats groups */ 55 /* Loop over all stats groups */
63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 56 for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
64 len += sprintf(buffer + len, "%s", xstats[i].desc); 57 seq_printf(m, "%s", xstats[i].desc);
65 /* inner loop does each group */ 58 /* inner loop does each group */
66 while (j < xstats[i].endpoint) { 59 while (j < xstats[i].endpoint) {
67 val = 0; 60 val = 0;
68 /* sum over all cpus */ 61 /* sum over all cpus */
69 for_each_possible_cpu(c) 62 for_each_possible_cpu(c)
70 val += *(((__u32*)&per_cpu(xfsstats, c) + j)); 63 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
71 len += sprintf(buffer + len, " %u", val); 64 seq_printf(m, " %u", val);
72 j++; 65 j++;
73 } 66 }
74 buffer[len++] = '\n'; 67 seq_putc(m, '\n');
75 } 68 }
76 /* extra precision counters */ 69 /* extra precision counters */
77 for_each_possible_cpu(i) { 70 for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
80 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; 73 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
81 } 74 }
82 75
83 len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", 76 seq_printf(m, "xpc %Lu %Lu %Lu\n",
84 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); 77 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
85 len += sprintf(buffer + len, "debug %u\n", 78 seq_printf(m, "debug %u\n",
86#if defined(DEBUG) 79#if defined(DEBUG)
87 1); 80 1);
88#else 81#else
89 0); 82 0);
90#endif 83#endif
84 return 0;
85}
91 86
92 if (offset >= len) { 87static int xfs_stat_proc_open(struct inode *inode, struct file *file)
93 *start = buffer; 88{
94 *eof = 1; 89 return single_open(file, xfs_stat_proc_show, NULL);
95 return 0;
96 }
97 *start = buffer + offset;
98 if ((len -= offset) > count)
99 return count;
100 *eof = 1;
101
102 return len;
103} 90}
104 91
92static const struct file_operations xfs_stat_proc_fops = {
93 .owner = THIS_MODULE,
94 .open = xfs_stat_proc_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
105int 100int
106xfs_init_procfs(void) 101xfs_init_procfs(void)
107{ 102{
108 if (!proc_mkdir("fs/xfs", NULL)) 103 if (!proc_mkdir("fs/xfs", NULL))
109 goto out; 104 goto out;
110 105
111 if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, 106 if (!proc_create("fs/xfs/stat", 0, NULL,
112 xfs_read_xfsstats, NULL)) 107 &xfs_stat_proc_fops))
113 goto out_remove_entry; 108 goto out_remove_entry;
114 return 0; 109 return 0;
115 110
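
The hunk above converts the old create_proc_read_entry() callback, with its manual buffer/offset/eof bookkeeping, to the seq_file single_open pattern. A generic, self-contained sketch of that pattern as of this kernel era (the example_* names are placeholders):

	#include <linux/module.h>
	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>

	static int example_proc_show(struct seq_file *m, void *v)
	{
		/* seq_file handles lengths, offsets and EOF for us. */
		seq_printf(m, "value %u\n", 42);
		return 0;
	}

	static int example_proc_open(struct inode *inode, struct file *file)
	{
		return single_open(file, example_proc_show, NULL);
	}

	static const struct file_operations example_proc_fops = {
		.owner	 = THIS_MODULE,
		.open	 = example_proc_open,
		.read	 = seq_read,
		.llseek	 = seq_lseek,
		.release = single_release,
	};

	/* registered with: proc_create("fs/example/stat", 0, NULL,
	 *			       &example_proc_fops); */
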
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..5d7c60ac77b4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -579,15 +579,19 @@ xfs_showargs(
579 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 579 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
580 seq_puts(m, "," MNTOPT_UQUOTANOENF); 580 seq_puts(m, "," MNTOPT_UQUOTANOENF);
581 581
582 if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 582 /* Either project or group quotas can be active, not both */
583 seq_puts(m, "," MNTOPT_PRJQUOTA); 583
584 else if (mp->m_qflags & XFS_PQUOTA_ACCT) 584 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
585 seq_puts(m, "," MNTOPT_PQUOTANOENF); 585 if (mp->m_qflags & XFS_OQUOTA_ENFD)
586 586 seq_puts(m, "," MNTOPT_PRJQUOTA);
587 if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) 587 else
588 seq_puts(m, "," MNTOPT_GRPQUOTA); 588 seq_puts(m, "," MNTOPT_PQUOTANOENF);
589 else if (mp->m_qflags & XFS_GQUOTA_ACCT) 589 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
590 seq_puts(m, "," MNTOPT_GQUOTANOENF); 590 if (mp->m_qflags & XFS_OQUOTA_ENFD)
591 seq_puts(m, "," MNTOPT_GRPQUOTA);
592 else
593 seq_puts(m, "," MNTOPT_GQUOTANOENF);
594 }
591 595
592 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 596 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
593 seq_puts(m, "," MNTOPT_NOQUOTA); 597 seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
687 return error; 691 return error;
688} 692}
689 693
690void 694STATIC void
691xfs_mountfs_check_barriers(xfs_mount_t *mp) 695xfs_mountfs_check_barriers(xfs_mount_t *mp)
692{ 696{
693 int error; 697 int error;
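
The xfs_showargs() restructuring makes the flag relationships explicit: project and group quota accounting are mutually exclusive, and XFS_OQUOTA_ENFD qualifies whichever of the two is active. A hypothetical helper distilling the same decision (flag macros from the XFS quota headers; option strings assumed to match the MNTOPT_* definitions):

	static const char *example_quota_opt(unsigned int qflags)
	{
		if (qflags & XFS_PQUOTA_ACCT)
			return (qflags & XFS_OQUOTA_ENFD) ?
				"prjquota" : "pqnoenforce";
		if (qflags & XFS_GQUOTA_ACCT)
			return (qflags & XFS_OQUOTA_ENFD) ?
				"grpquota" : "gqnoenforce";
		return NULL;	/* no project/group accounting active */
	}
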
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
750} 750}
751 751
752void
753xfs_inode_clear_reclaim_tag(
754 xfs_inode_t *ip)
755{
756 xfs_mount_t *mp = ip->i_mount;
757 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
758
759 read_lock(&pag->pag_ici_lock);
760 spin_lock(&ip->i_flags_lock);
761 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
762 spin_unlock(&ip->i_flags_lock);
763 read_unlock(&pag->pag_ici_lock);
764 xfs_put_perag(mp, pag);
765}
766
767STATIC int 752STATIC int
768xfs_reclaim_inode_now( 753xfs_reclaim_inode_now(
769 struct xfs_inode *ip, 754 struct xfs_inode *ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 49
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 52void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip); 53 struct xfs_inode *ip);
55 54
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
48 48
49struct xqmstats xqmstats; 49struct xqmstats xqmstats;
50 50
51STATIC int 51static int xqm_proc_show(struct seq_file *m, void *v)
52xfs_qm_read_xfsquota(
53 char *buffer,
54 char **start,
55 off_t offset,
56 int count,
57 int *eof,
58 void *data)
59{ 52{
60 int len;
61
62 /* maximum; incore; ratio free to inuse; freelist */ 53 /* maximum; incore; ratio free to inuse; freelist */
63 len = sprintf(buffer, "%d\t%d\t%d\t%u\n", 54 seq_printf(m, "%d\t%d\t%d\t%u\n",
64 ndquot, 55 ndquot,
65 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
66 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
67 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
68 59 return 0;
69 if (offset >= len) {
70 *start = buffer;
71 *eof = 1;
72 return 0;
73 }
74 *start = buffer + offset;
75 if ((len -= offset) > count)
76 return count;
77 *eof = 1;
78
79 return len;
80} 60}
81 61
82STATIC int 62static int xqm_proc_open(struct inode *inode, struct file *file)
83xfs_qm_read_stats(
84 char *buffer,
85 char **start,
86 off_t offset,
87 int count,
88 int *eof,
89 void *data)
90{ 63{
91 int len; 64 return single_open(file, xqm_proc_show, NULL);
65}
66
67static const struct file_operations xqm_proc_fops = {
68 .owner = THIS_MODULE,
69 .open = xqm_proc_open,
70 .read = seq_read,
71 .llseek = seq_lseek,
72 .release = single_release,
73};
92 74
75static int xqmstat_proc_show(struct seq_file *m, void *v)
76{
93 /* quota performance statistics */ 77 /* quota performance statistics */
94 len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", 78 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
95 xqmstats.xs_qm_dqreclaims, 79 xqmstats.xs_qm_dqreclaims,
96 xqmstats.xs_qm_dqreclaim_misses, 80 xqmstats.xs_qm_dqreclaim_misses,
97 xqmstats.xs_qm_dquot_dups, 81 xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
100 xqmstats.xs_qm_dqwants, 84 xqmstats.xs_qm_dqwants,
101 xqmstats.xs_qm_dqshake_reclaims, 85 xqmstats.xs_qm_dqshake_reclaims,
102 xqmstats.xs_qm_dqinact_reclaims); 86 xqmstats.xs_qm_dqinact_reclaims);
87 return 0;
88}
103 89
104 if (offset >= len) { 90static int xqmstat_proc_open(struct inode *inode, struct file *file)
105 *start = buffer; 91{
106 *eof = 1; 92 return single_open(file, xqmstat_proc_show, NULL);
107 return 0;
108 }
109 *start = buffer + offset;
110 if ((len -= offset) > count)
111 return count;
112 *eof = 1;
113
114 return len;
115} 93}
116 94
95static const struct file_operations xqmstat_proc_fops = {
96 .owner = THIS_MODULE,
97 .open = xqmstat_proc_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = single_release,
101};
102
117void 103void
118xfs_qm_init_procfs(void) 104xfs_qm_init_procfs(void)
119{ 105{
120 create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); 106 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
121 create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); 107 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
122} 108}
123 109
124void 110void
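
This is the same single_open conversion as in fs/xfs/linux-2.6/xfs_stats.c earlier in the diff; note that xfs_qm_init_procfs() ignores proc_create()'s return value, matching the old create_proc_read_entry() behaviour. A sketch of a checked variant, for illustration only:

	void example_qm_init_procfs(void)
	{
		if (!proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops))
			printk(KERN_WARNING "xfs: cannot create fs/xfs/xqmstat\n");
		if (!proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops))
			printk(KERN_WARNING "xfs: cannot create fs/xfs/xqm\n");
	}
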
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
198 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
199 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */ 200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
201
202 /*
203 * Inode allocation search lookup optimisation.
204 * If the pagino matches, the search for new inodes
 205	 * doesn't need to revisit the nearby chunks straight away.
206 */
207 xfs_agino_t pagl_pagino;
208 xfs_agino_t pagl_leftrec;
209 xfs_agino_t pagl_rightrec;
201#ifdef __KERNEL__ 210#ifdef __KERNEL__
202 spinlock_t pagb_lock; /* lock for pagb_list */ 211 spinlock_t pagb_lock; /* lock for pagb_list */
203 212
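
The three new pagl_* fields cache where the last inode-allocation search for a given parent left off, so the next allocation can resume from that frontier instead of rescanning; NULLAGINO marks the cache invalid. A small sketch of the resume test used later in this diff (fs/xfs/xfs_ialloc.c):

	/* Illustrative predicate; mirrors the check in xfs_dialloc(). */
	static bool example_can_resume_search(const struct xfs_perag *pag,
					      xfs_agino_t pagino)
	{
		return pagino != NULLAGINO &&
		       pag->pagl_pagino == pagino &&
		       pag->pagl_leftrec != NULLAGINO &&
		       pag->pagl_rightrec != NULLAGINO;
	}
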
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
3713 * entry (null if none). Else, *lastxp will be set to the index 3713 * entry (null if none). Else, *lastxp will be set to the index
3714 * of the found entry; *gotp will contain the entry. 3714 * of the found entry; *gotp will contain the entry.
3715 */ 3715 */
3716xfs_bmbt_rec_host_t * /* pointer to found extent entry */ 3716STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
3717xfs_bmap_search_multi_extents( 3717xfs_bmap_search_multi_extents(
3718 xfs_ifork_t *ifp, /* inode fork pointer */ 3718 xfs_ifork_t *ifp, /* inode fork pointer */
3719 xfs_fileoff_t bno, /* block number searched for */ 3719 xfs_fileoff_t bno, /* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
392 int whichfork, 392 int whichfork,
393 int *count); 393 int *count);
394 394
395/*
396 * Search the extent records for the entry containing block bno.
397 * If bno lies in a hole, point to the next entry. If bno lies
398 * past eof, *eofp will be set, and *prevp will contain the last
399 * entry (null if none). Else, *lastxp will be set to the index
400 * of the found entry; *gotp will contain the entry.
401 */
402xfs_bmbt_rec_host_t *
403xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
404 xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
405
406#endif /* __KERNEL__ */ 395#endif /* __KERNEL__ */
407 396
408#endif /* __XFS_BMAP_H__ */ 397#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
202 ext_flag); 202 ext_flag);
203} 203}
204 204
205/* Endian flipping versions of the bmbt extraction functions */
206void
207xfs_bmbt_disk_get_all(
208 xfs_bmbt_rec_t *r,
209 xfs_bmbt_irec_t *s)
210{
211 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
212 get_unaligned_be64(&r->l1), s);
213}
214
215/* 205/*
216 * Extract the blockcount field from an on disk bmap extent record. 206 * Extract the blockcount field from an on disk bmap extent record.
217 */ 207 */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
816 *l1 = 0; 806 *l1 = 0;
817} 807}
818 808
809/* Endian flipping versions of the bmbt extraction functions */
810STATIC void
811xfs_bmbt_disk_get_all(
812 xfs_bmbt_rec_t *r,
813 xfs_bmbt_irec_t *s)
814{
815 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
816 get_unaligned_be64(&r->l1), s);
817}
818
819STATIC void 819STATIC void
820xfs_bmbt_trace_record( 820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur, 821 struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); 220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); 221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
222 222
223extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
224extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); 223extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
225extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); 224extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
226 225
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
646} 646}
647 647
648/* 648/*
649 * Get a buffer for the block, return it read in.
650 * Short-form addressing.
651 */
652int /* error */
653xfs_btree_read_bufs(
654 xfs_mount_t *mp, /* file system mount point */
655 xfs_trans_t *tp, /* transaction pointer */
656 xfs_agnumber_t agno, /* allocation group number */
657 xfs_agblock_t agbno, /* allocation group block number */
658 uint lock, /* lock flags for read_buf */
659 xfs_buf_t **bpp, /* buffer for agno/agbno */
660 int refval) /* ref count value for buffer */
661{
662 xfs_buf_t *bp; /* return value */
663 xfs_daddr_t d; /* real disk block address */
664 int error;
665
666 ASSERT(agno != NULLAGNUMBER);
667 ASSERT(agbno != NULLAGBLOCK);
668 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
669 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
670 mp->m_bsize, lock, &bp))) {
671 return error;
672 }
673 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
674 if (bp != NULL) {
675 switch (refval) {
676 case XFS_ALLOC_BTREE_REF:
677 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
678 break;
679 case XFS_INO_BTREE_REF:
680 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
681 break;
682 }
683 }
684 *bpp = bp;
685 return 0;
686}
687
688/*
689 * Read-ahead the block, don't wait for it, don't return a buffer. 649 * Read-ahead the block, don't wait for it, don't return a buffer.
690 * Long-form addressing. 650 * Long-form addressing.
691 */ 651 */
@@ -2951,7 +2911,7 @@ error0:
2951 * inode we have to copy the single block it was pointing to into the 2911 * inode we have to copy the single block it was pointing to into the
2952 * inode. 2912 * inode.
2953 */ 2913 */
2954int 2914STATIC int
2955xfs_btree_kill_iroot( 2915xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur) 2916 struct xfs_btree_cur *cur)
2957{ 2917{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
379 int refval);/* ref count value for buffer */ 379 int refval);/* ref count value for buffer */
380 380
381/* 381/*
382 * Get a buffer for the block, return it read in.
383 * Short-form addressing.
384 */
385int /* error */
386xfs_btree_read_bufs(
387 struct xfs_mount *mp, /* file system mount point */
388 struct xfs_trans *tp, /* transaction pointer */
389 xfs_agnumber_t agno, /* allocation group number */
390 xfs_agblock_t agbno, /* allocation group block number */
391 uint lock, /* lock flags for read_buf */
392 struct xfs_buf **bpp, /* buffer for agno/agbno */
393 int refval);/* ref count value for buffer */
394
395/*
396 * Read-ahead the block, don't wait for it, don't return a buffer. 382 * Read-ahead the block, don't wait for it, don't return a buffer.
397 * Long-form addressing. 383 * Long-form addressing.
398 */ 384 */
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); 418int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); 419int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); 420int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *); 421int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *); 422int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 423int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
57} 57}
58 58
59/* 59/*
60 * Lookup the record equal to ino in the btree given by cur. 60 * Lookup a record by ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */ 61 */
80int /* error */ 62int /* error */
81xfs_inobt_lookup_ge( 63xfs_inobt_lookup(
82 struct xfs_btree_cur *cur, /* btree cursor */ 64 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */ 65 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */ 66 xfs_lookup_t dir, /* <=, >=, == */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */ 67 int *stat) /* success/failure */
87{ 68{
88 cur->bc_rec.i.ir_startino = ino; 69 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt; 70 cur->bc_rec.i.ir_freecount = 0;
90 cur->bc_rec.i.ir_free = free; 71 cur->bc_rec.i.ir_free = 0;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 72 return xfs_btree_lookup(cur, dir, stat);
92} 73}
93 74
94/* 75/*
95 * Lookup the first record less than or equal to ino 76 * Update the record referred to by cur to the value given.
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error. 77 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */ 78 */
117STATIC int /* error */ 79STATIC int /* error */
118xfs_inobt_update( 80xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */ 81 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */ 82 xfs_inobt_rec_incore_t *irec) /* btree record */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{ 83{
124 union xfs_btree_rec rec; 84 union xfs_btree_rec rec;
125 85
126 rec.inobt.ir_startino = cpu_to_be32(ino); 86 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt); 87 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
128 rec.inobt.ir_free = cpu_to_be64(free); 88 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
129 return xfs_btree_update(cur, &rec); 89 return xfs_btree_update(cur, &rec);
130} 90}
131 91
@@ -135,9 +95,7 @@ xfs_inobt_update(
135int /* error */ 95int /* error */
136xfs_inobt_get_rec( 96xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */ 97 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */ 98 xfs_inobt_rec_incore_t *irec, /* btree record */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */ 99 int *stat) /* output: success/failure */
142{ 100{
143 union xfs_btree_rec *rec; 101 union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
145 103
146 error = xfs_btree_get_rec(cur, &rec, stat); 104 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) { 105 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino); 106 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount); 107 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free); 108 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
151 } 109 }
152 return error; 110 return error;
153} 111}
154 112
155/* 113/*
114 * Verify that the number of free inodes in the AGI is correct.
115 */
116#ifdef DEBUG
117STATIC int
118xfs_check_agi_freecount(
119 struct xfs_btree_cur *cur,
120 struct xfs_agi *agi)
121{
122 if (cur->bc_nlevels == 1) {
123 xfs_inobt_rec_incore_t rec;
124 int freecount = 0;
125 int error;
126 int i;
127
128 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
129 if (error)
130 return error;
131
132 do {
133 error = xfs_inobt_get_rec(cur, &rec, &i);
134 if (error)
135 return error;
136
137 if (i) {
138 freecount += rec.ir_freecount;
139 error = xfs_btree_increment(cur, 0, &i);
140 if (error)
141 return error;
142 }
143 } while (i == 1);
144
145 if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
146 ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
147 }
148 return 0;
149}
150#else
151#define xfs_check_agi_freecount(cur, agi) 0
152#endif
153
154/*
155 * Initialise a new set of inodes.
156 */
157STATIC void
158xfs_ialloc_inode_init(
159 struct xfs_mount *mp,
160 struct xfs_trans *tp,
161 xfs_agnumber_t agno,
162 xfs_agblock_t agbno,
163 xfs_agblock_t length,
164 unsigned int gen)
165{
166 struct xfs_buf *fbuf;
167 struct xfs_dinode *free;
168 int blks_per_cluster, nbufs, ninodes;
169 int version;
170 int i, j;
171 xfs_daddr_t d;
172
173 /*
174 * Loop over the new block(s), filling in the inodes.
175 * For small block sizes, manipulate the inodes in buffers
 176	 * which are multiples of the block size.
177 */
178 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
179 blks_per_cluster = 1;
180 nbufs = length;
181 ninodes = mp->m_sb.sb_inopblock;
182 } else {
183 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
184 mp->m_sb.sb_blocksize;
185 nbufs = length / blks_per_cluster;
186 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
187 }
188
189 /*
190 * Figure out what version number to use in the inodes we create.
191 * If the superblock version has caught up to the one that supports
192 * the new inode format, then use the new inode version. Otherwise
193 * use the old version so that old kernels will continue to be
194 * able to use the file system.
195 */
196 if (xfs_sb_version_hasnlink(&mp->m_sb))
197 version = 2;
198 else
199 version = 1;
200
201 for (j = 0; j < nbufs; j++) {
202 /*
203 * Get the block.
204 */
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK);
209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211
212 /*
213 * Initialize all inodes in this buffer and then log them.
214 *
215 * XXX: It would be much better if we had just one transaction
216 * to log a whole cluster of inodes instead of all the
217 * individual transactions causing a lot of log traffic.
218 */
219 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
220 for (i = 0; i < ninodes; i++) {
221 int ioffset = i << mp->m_sb.sb_inodelog;
222 uint isize = sizeof(struct xfs_dinode);
223
224 free = xfs_make_iptr(mp, fbuf, i);
225 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
226 free->di_version = version;
227 free->di_gen = cpu_to_be32(gen);
228 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
229 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
230 }
231 xfs_trans_inode_alloc_buf(tp, fbuf);
232 }
233}
234
235/*
156 * Allocate new inodes in the allocation group specified by agbp. 236 * Allocate new inodes in the allocation group specified by agbp.
157 * Return 0 for success, else error code. 237 * Return 0 for success, else error code.
158 */ 238 */
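
Two factorings land above: xfs_check_agi_freecount() replaces the repeated #ifdef DEBUG freecount walks, compiling away to 0 in non-debug builds, and xfs_ialloc_inode_init() pulls inode-buffer initialisation out of xfs_ialloc_ag_alloc(). The debug-helper idiom, in generic form:

	/* Real function under DEBUG, constant 0 otherwise, so callers can
	 * write plain "error = example_check(x); if (error) ..." without
	 * any #ifdef at the call site. (Generic sketch, not XFS code.) */
	#ifdef DEBUG
	STATIC int example_check(struct example_ctx *x)
	{
		/* walk the structure, ASSERT() its invariants */
		return 0;
	}
	#else
	#define example_check(x)	0
	#endif
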
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
164{ 244{
165 xfs_agi_t *agi; /* allocation group header */ 245 xfs_agi_t *agi; /* allocation group header */
166 xfs_alloc_arg_t args; /* allocation argument structure */ 246 xfs_alloc_arg_t args; /* allocation argument structure */
167 int blks_per_cluster; /* fs blocks per inode cluster */
168 xfs_btree_cur_t *cur; /* inode btree cursor */ 247 xfs_btree_cur_t *cur; /* inode btree cursor */
169 xfs_daddr_t d; /* disk addr of buffer */
170 xfs_agnumber_t agno; 248 xfs_agnumber_t agno;
171 int error; 249 int error;
172 xfs_buf_t *fbuf; /* new free inodes' buffer */ 250 int i;
173 xfs_dinode_t *free; /* new free inode structure */
174 int i; /* inode counter */
175 int j; /* block counter */
176 int nbufs; /* num bufs of new inodes */
177 xfs_agino_t newino; /* new first inode's number */ 251 xfs_agino_t newino; /* new first inode's number */
178 xfs_agino_t newlen; /* new number of inodes */ 252 xfs_agino_t newlen; /* new number of inodes */
179 int ninodes; /* num inodes per buf */
180 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
181 int version; /* inode version number to use */
182 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
183 /* boundary */ 255 /* boundary */
184 unsigned int gen;
185 256
186 args.tp = tp; 257 args.tp = tp;
187 args.mp = tp->t_mountp; 258 args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
202 */ 273 */
203 agi = XFS_BUF_TO_AGI(agbp); 274 agi = XFS_BUF_TO_AGI(agbp);
204 newino = be32_to_cpu(agi->agi_newino); 275 newino = be32_to_cpu(agi->agi_newino);
276 agno = be32_to_cpu(agi->agi_seqno);
205 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 277 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
206 XFS_IALLOC_BLOCKS(args.mp); 278 XFS_IALLOC_BLOCKS(args.mp);
207 if (likely(newino != NULLAGINO && 279 if (likely(newino != NULLAGINO &&
208 (args.agbno < be32_to_cpu(agi->agi_length)))) { 280 (args.agbno < be32_to_cpu(agi->agi_length)))) {
209 args.fsbno = XFS_AGB_TO_FSB(args.mp, 281 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
210 be32_to_cpu(agi->agi_seqno), args.agbno);
211 args.type = XFS_ALLOCTYPE_THIS_BNO; 282 args.type = XFS_ALLOCTYPE_THIS_BNO;
212 args.mod = args.total = args.wasdel = args.isfl = 283 args.mod = args.total = args.wasdel = args.isfl =
213 args.userdata = args.minalignslop = 0; 284 args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
258 * For now, just allocate blocks up front. 329 * For now, just allocate blocks up front.
259 */ 330 */
260 args.agbno = be32_to_cpu(agi->agi_root); 331 args.agbno = be32_to_cpu(agi->agi_root);
261 args.fsbno = XFS_AGB_TO_FSB(args.mp, 332 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
262 be32_to_cpu(agi->agi_seqno), args.agbno);
263 /* 333 /*
264 * Allocate a fixed-size extent of inodes. 334 * Allocate a fixed-size extent of inodes.
265 */ 335 */
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
282 if (isaligned && args.fsbno == NULLFSBLOCK) { 352 if (isaligned && args.fsbno == NULLFSBLOCK) {
283 args.type = XFS_ALLOCTYPE_NEAR_BNO; 353 args.type = XFS_ALLOCTYPE_NEAR_BNO;
284 args.agbno = be32_to_cpu(agi->agi_root); 354 args.agbno = be32_to_cpu(agi->agi_root);
285 args.fsbno = XFS_AGB_TO_FSB(args.mp, 355 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
286 be32_to_cpu(agi->agi_seqno), args.agbno);
287 args.alignment = xfs_ialloc_cluster_alignment(&args); 356 args.alignment = xfs_ialloc_cluster_alignment(&args);
288 if ((error = xfs_alloc_vextent(&args))) 357 if ((error = xfs_alloc_vextent(&args)))
289 return error; 358 return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
294 return 0; 363 return 0;
295 } 364 }
296 ASSERT(args.len == args.minlen); 365 ASSERT(args.len == args.minlen);
297 /*
298 * Convert the results.
299 */
300 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
301 /*
302 * Loop over the new block(s), filling in the inodes.
303 * For small block sizes, manipulate the inodes in buffers
304 * which are multiples of the blocks size.
305 */
306 if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
307 blks_per_cluster = 1;
308 nbufs = (int)args.len;
309 ninodes = args.mp->m_sb.sb_inopblock;
310 } else {
311 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
312 args.mp->m_sb.sb_blocksize;
313 nbufs = (int)args.len / blks_per_cluster;
314 ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
315 }
316 /*
317 * Figure out what version number to use in the inodes we create.
318 * If the superblock version has caught up to the one that supports
319 * the new inode format, then use the new inode version. Otherwise
320 * use the old version so that old kernels will continue to be
321 * able to use the file system.
322 */
323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
324 version = 2;
325 else
326 version = 1;
327 366
328 /* 367 /*
368 * Stamp and write the inode buffers.
369 *
329 * Seed the new inode cluster with a random generation number. This 370 * Seed the new inode cluster with a random generation number. This
330 * prevents short-term reuse of generation numbers if a chunk is 371 * prevents short-term reuse of generation numbers if a chunk is
331 * freed and then immediately reallocated. We use random numbers 372 * freed and then immediately reallocated. We use random numbers
332 * rather than a linear progression to prevent the next generation 373 * rather than a linear progression to prevent the next generation
333 * number from being easily guessable. 374 * number from being easily guessable.
334 */ 375 */
335 gen = random32(); 376 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
336 for (j = 0; j < nbufs; j++) { 377 random32());
337 /*
338 * Get the block.
339 */
340 d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
341 args.agbno + (j * blks_per_cluster));
342 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
343 args.mp->m_bsize * blks_per_cluster,
344 XFS_BUF_LOCK);
345 ASSERT(fbuf);
346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347 378
348 /* 379 /*
349 * Initialize all inodes in this buffer and then log them. 380 * Convert the results.
350 * 381 */
351 * XXX: It would be much better if we had just one transaction to 382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen);
364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
366 }
367 xfs_trans_inode_alloc_buf(tp, fbuf);
368 }
369 be32_add_cpu(&agi->agi_count, newlen); 383 be32_add_cpu(&agi->agi_count, newlen);
370 be32_add_cpu(&agi->agi_freecount, newlen); 384 be32_add_cpu(&agi->agi_freecount, newlen);
371 agno = be32_to_cpu(agi->agi_seqno);
372 down_read(&args.mp->m_peraglock); 385 down_read(&args.mp->m_peraglock);
373 args.mp->m_perag[agno].pagi_freecount += newlen; 386 args.mp->m_perag[agno].pagi_freecount += newlen;
374 up_read(&args.mp->m_peraglock); 387 up_read(&args.mp->m_peraglock);
375 agi->agi_newino = cpu_to_be32(newino); 388 agi->agi_newino = cpu_to_be32(newino);
389
376 /* 390 /*
377 * Insert records describing the new inode chunk into the btree. 391 * Insert records describing the new inode chunk into the btree.
378 */ 392 */
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
380 for (thisino = newino; 394 for (thisino = newino;
381 thisino < newino + newlen; 395 thisino < newino + newlen;
382 thisino += XFS_INODES_PER_CHUNK) { 396 thisino += XFS_INODES_PER_CHUNK) {
383 if ((error = xfs_inobt_lookup_eq(cur, thisino, 397 cur->bc_rec.i.ir_startino = thisino;
384 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { 398 cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
399 cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
400 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
401 if (error) {
385 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 402 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
386 return error; 403 return error;
387 } 404 }
388 ASSERT(i == 0); 405 ASSERT(i == 0);
389 if ((error = xfs_btree_insert(cur, &i))) { 406 error = xfs_btree_insert(cur, &i);
407 if (error) {
390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 408 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
391 return error; 409 return error;
392 } 410 }
@@ -539,6 +557,62 @@ nextag:
539} 557}
540 558
541/* 559/*
560 * Try to retrieve the next record to the left/right from the current one.
561 */
562STATIC int
563xfs_ialloc_next_rec(
564 struct xfs_btree_cur *cur,
565 xfs_inobt_rec_incore_t *rec,
566 int *done,
567 int left)
568{
569 int error;
570 int i;
571
572 if (left)
573 error = xfs_btree_decrement(cur, 0, &i);
574 else
575 error = xfs_btree_increment(cur, 0, &i);
576
577 if (error)
578 return error;
579 *done = !i;
580 if (i) {
581 error = xfs_inobt_get_rec(cur, rec, &i);
582 if (error)
583 return error;
584 XFS_WANT_CORRUPTED_RETURN(i == 1);
585 }
586
587 return 0;
588}
589
590STATIC int
591xfs_ialloc_get_rec(
592 struct xfs_btree_cur *cur,
593 xfs_agino_t agino,
594 xfs_inobt_rec_incore_t *rec,
595 int *done,
596 int left)
597{
598 int error;
599 int i;
600
601 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
602 if (error)
603 return error;
604 *done = !i;
605 if (i) {
606 error = xfs_inobt_get_rec(cur, rec, &i);
607 if (error)
608 return error;
609 XFS_WANT_CORRUPTED_RETURN(i == 1);
610 }
611
612 return 0;
613}
614
615/*
542 * Visible inode allocation functions. 616 * Visible inode allocation functions.
543 */ 617 */
544 618
@@ -592,8 +666,8 @@ xfs_dialloc(
592 int j; /* result code */ 666 int j; /* result code */
593 xfs_mount_t *mp; /* file system mount structure */ 667 xfs_mount_t *mp; /* file system mount structure */
594 int offset; /* index of inode in chunk */ 668 int offset; /* index of inode in chunk */
595 xfs_agino_t pagino; /* parent's a.g. relative inode # */ 669 xfs_agino_t pagino; /* parent's AG relative inode # */
596 xfs_agnumber_t pagno; /* parent's allocation group number */ 670 xfs_agnumber_t pagno; /* parent's AG number */
597 xfs_inobt_rec_incore_t rec; /* inode allocation record */ 671 xfs_inobt_rec_incore_t rec; /* inode allocation record */
598 xfs_agnumber_t tagno; /* testing allocation group number */ 672 xfs_agnumber_t tagno; /* testing allocation group number */
599 xfs_btree_cur_t *tcur; /* temp cursor */ 673 xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
716 */ 790 */
717 agno = tagno; 791 agno = tagno;
718 *IO_agbp = NULL; 792 *IO_agbp = NULL;
793
794 restart_pagno:
719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
720 /* 796 /*
721 * If pagino is 0 (this is the root inode allocation) use newino. 797 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
723 */ 799 */
724 if (!pagino) 800 if (!pagino)
725 pagino = be32_to_cpu(agi->agi_newino); 801 pagino = be32_to_cpu(agi->agi_newino);
726#ifdef DEBUG
727 if (cur->bc_nlevels == 1) {
728 int freecount = 0;
729 802
730 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 803 error = xfs_check_agi_freecount(cur, agi);
731 goto error0; 804 if (error)
732 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 805 goto error0;
733 do {
734 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
735 &rec.ir_freecount, &rec.ir_free, &i)))
736 goto error0;
737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
738 freecount += rec.ir_freecount;
739 if ((error = xfs_btree_increment(cur, 0, &i)))
740 goto error0;
741 } while (i == 1);
742 806
743 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
744 XFS_FORCED_SHUTDOWN(mp));
745 }
746#endif
747 /* 807 /*
748 * If in the same a.g. as the parent, try to get near the parent. 808 * If in the same AG as the parent, try to get near the parent.
749 */ 809 */
750 if (pagno == agno) { 810 if (pagno == agno) {
751 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) 811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */
814 int searchdistance = 10;
815
816 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
817 if (error)
818 goto error0;
819 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
820
821 error = xfs_inobt_get_rec(cur, &rec, &j);
822 if (error)
752 goto error0; 823 goto error0;
753 if (i != 0 && 824 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
754 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 825
755 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 826 if (rec.ir_freecount > 0) {
756 j == 1 &&
757 rec.ir_freecount > 0) {
758 /* 827 /*
759 * Found a free inode in the same chunk 828 * Found a free inode in the same chunk
760 * as parent, done. 829 * as the parent, done.
761 */ 830 */
831 goto alloc_inode;
762 } 832 }
833
834
835 /*
836 * In the same AG as parent, but parent's chunk is full.
837 */
838
839 /* duplicate the cursor, search left & right simultaneously */
840 error = xfs_btree_dup_cursor(cur, &tcur);
841 if (error)
842 goto error0;
843
763 /* 844 /*
764 * In the same a.g. as parent, but parent's chunk is full. 845 * Skip to last blocks looked up if same parent inode.
765 */ 846 */
766 else { 847 if (pagino != NULLAGINO &&
767 int doneleft; /* done, to the left */ 848 pag->pagl_pagino == pagino &&
768 int doneright; /* done, to the right */ 849 pag->pagl_leftrec != NULLAGINO &&
850 pag->pagl_rightrec != NULLAGINO) {
851 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
852 &trec, &doneleft, 1);
853 if (error)
854 goto error1;
769 855
856 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
857 &rec, &doneright, 0);
770 if (error) 858 if (error)
771 goto error0;
772 ASSERT(i == 1);
773 ASSERT(j == 1);
774 /*
775 * Duplicate the cursor, search left & right
776 * simultaneously.
777 */
778 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
779 goto error0;
780 /*
781 * Search left with tcur, back up 1 record.
782 */
783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
784 goto error1; 859 goto error1;
785 doneleft = !i; 860 } else {
786 if (!doneleft) { 861 /* search left with tcur, back up 1 record */
787 if ((error = xfs_inobt_get_rec(tcur, 862 error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
788 &trec.ir_startino, 863 if (error)
789 &trec.ir_freecount,
790 &trec.ir_free, &i)))
791 goto error1;
792 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
793 }
794 /*
795 * Search right with cur, go forward 1 record.
796 */
797 if ((error = xfs_btree_increment(cur, 0, &i)))
798 goto error1; 864 goto error1;
799 doneright = !i;
800 if (!doneright) {
801 if ((error = xfs_inobt_get_rec(cur,
802 &rec.ir_startino,
803 &rec.ir_freecount,
804 &rec.ir_free, &i)))
805 goto error1;
806 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
807 }
808 /*
809 * Loop until we find the closest inode chunk
810 * with a free one.
811 */
812 while (!doneleft || !doneright) {
813 int useleft; /* using left inode
814 chunk this time */
815 865
866 /* search right with cur, go forward 1 record. */
867 error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
868 if (error)
869 goto error1;
870 }
871
872 /*
873 * Loop until we find an inode chunk with a free inode.
874 */
875 while (!doneleft || !doneright) {
876 int useleft; /* using left inode chunk this time */
877
878 if (!--searchdistance) {
816 /* 879 /*
817 * Figure out which block is closer, 880 * Not in range - save last search
818 * if both are valid. 881 * location and allocate a new inode
819 */
820 if (!doneleft && !doneright)
821 useleft =
822 pagino -
823 (trec.ir_startino +
824 XFS_INODES_PER_CHUNK - 1) <
825 rec.ir_startino - pagino;
826 else
827 useleft = !doneleft;
828 /*
829 * If checking the left, does it have
830 * free inodes?
831 */
832 if (useleft && trec.ir_freecount) {
833 /*
834 * Yes, set it up as the chunk to use.
835 */
836 rec = trec;
837 xfs_btree_del_cursor(cur,
838 XFS_BTREE_NOERROR);
839 cur = tcur;
840 break;
841 }
842 /*
843 * If checking the right, does it have
844 * free inodes?
845 */
846 if (!useleft && rec.ir_freecount) {
847 /*
848 * Yes, it's already set up.
849 */
850 xfs_btree_del_cursor(tcur,
851 XFS_BTREE_NOERROR);
852 break;
853 }
854 /*
855 * If used the left, get another one
856 * further left.
857 */
858 if (useleft) {
859 if ((error = xfs_btree_decrement(tcur, 0,
860 &i)))
861 goto error1;
862 doneleft = !i;
863 if (!doneleft) {
864 if ((error = xfs_inobt_get_rec(
865 tcur,
866 &trec.ir_startino,
867 &trec.ir_freecount,
868 &trec.ir_free, &i)))
869 goto error1;
870 XFS_WANT_CORRUPTED_GOTO(i == 1,
871 error1);
872 }
873 }
874 /*
875 * If used the right, get another one
876 * further right.
877 */ 882 */
878 else { 883 pag->pagl_leftrec = trec.ir_startino;
879 if ((error = xfs_btree_increment(cur, 0, 884 pag->pagl_rightrec = rec.ir_startino;
880 &i))) 885 pag->pagl_pagino = pagino;
881 goto error1; 886 goto newino;
882 doneright = !i; 887 }
883 if (!doneright) { 888
884 if ((error = xfs_inobt_get_rec( 889 /* figure out the closer block if both are valid. */
885 cur, 890 if (!doneleft && !doneright) {
886 &rec.ir_startino, 891 useleft = pagino -
887 &rec.ir_freecount, 892 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
888 &rec.ir_free, &i))) 893 rec.ir_startino - pagino;
889 goto error1; 894 } else {
890 XFS_WANT_CORRUPTED_GOTO(i == 1, 895 useleft = !doneleft;
891 error1);
892 }
893 }
894 } 896 }
895 ASSERT(!doneleft || !doneright); 897
898 /* free inodes to the left? */
899 if (useleft && trec.ir_freecount) {
900 rec = trec;
901 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
902 cur = tcur;
903
904 pag->pagl_leftrec = trec.ir_startino;
905 pag->pagl_rightrec = rec.ir_startino;
906 pag->pagl_pagino = pagino;
907 goto alloc_inode;
908 }
909
910 /* free inodes to the right? */
911 if (!useleft && rec.ir_freecount) {
912 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
913
914 pag->pagl_leftrec = trec.ir_startino;
915 pag->pagl_rightrec = rec.ir_startino;
916 pag->pagl_pagino = pagino;
917 goto alloc_inode;
918 }
919
920 /* get next record to check */
921 if (useleft) {
922 error = xfs_ialloc_next_rec(tcur, &trec,
923 &doneleft, 1);
924 } else {
925 error = xfs_ialloc_next_rec(cur, &rec,
926 &doneright, 0);
927 }
928 if (error)
929 goto error1;
896 } 930 }
931
932 /*
933 * We've reached the end of the btree. because
934 * we are only searching a small chunk of the
935 * btree each search, there is obviously free
936 * inodes closer to the parent inode than we
937 * are now. restart the search again.
938 */
939 pag->pagl_pagino = NULLAGINO;
940 pag->pagl_leftrec = NULLAGINO;
941 pag->pagl_rightrec = NULLAGINO;
942 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
943 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
944 goto restart_pagno;
897 } 945 }
946
898 /* 947 /*
899 * In a different a.g. from the parent. 948 * In a different AG from the parent.
900 * See if the most recently allocated block has any free. 949 * See if the most recently allocated block has any free.
901 */ 950 */
902 else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { 951newino:
903 if ((error = xfs_inobt_lookup_eq(cur, 952 if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
904 be32_to_cpu(agi->agi_newino), 0, 0, &i))) 953 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
954 XFS_LOOKUP_EQ, &i);
955 if (error)
905 goto error0; 956 goto error0;
906 if (i == 1 && 957
907 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 958 if (i == 1) {
908 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 959 error = xfs_inobt_get_rec(cur, &rec, &j);
909 j == 1 &&
910 rec.ir_freecount > 0) {
911 /*
912 * The last chunk allocated in the group still has
913 * a free inode.
914 */
915 }
916 /*
917 * None left in the last group, search the whole a.g.
918 */
919 else {
920 if (error) 960 if (error)
921 goto error0; 961 goto error0;
922 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 962
923 goto error0; 963 if (j == 1 && rec.ir_freecount > 0) {
924 ASSERT(i == 1); 964 /*
925 for (;;) { 965 * The last chunk allocated in the group
926 if ((error = xfs_inobt_get_rec(cur, 966 * still has a free inode.
927 &rec.ir_startino, 967 */
928 &rec.ir_freecount, &rec.ir_free, 968 goto alloc_inode;
929 &i)))
930 goto error0;
931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
932 if (rec.ir_freecount > 0)
933 break;
934 if ((error = xfs_btree_increment(cur, 0, &i)))
935 goto error0;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
937 } 969 }
938 } 970 }
939 } 971 }
972
973 /*
974 * None left in the last group, search the whole AG
975 */
976 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
977 if (error)
978 goto error0;
979 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
980
981 for (;;) {
982 error = xfs_inobt_get_rec(cur, &rec, &i);
983 if (error)
984 goto error0;
985 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
986 if (rec.ir_freecount > 0)
987 break;
988 error = xfs_btree_increment(cur, 0, &i);
989 if (error)
990 goto error0;
991 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
992 }
993
994alloc_inode:
940 offset = xfs_ialloc_find_free(&rec.ir_free); 995 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 996 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 997 ASSERT(offset < XFS_INODES_PER_CHUNK);
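
The rewritten parent-relative search above walks left and right with two cursors but caps the walk at searchdistance (10) records; on exhaustion it stores the frontier in pag->pagl_* and jumps to the newino/whole-AG fallback, and running off both ends of the btree invalidates the cache and restarts. A distilled, self-contained model of the bounded two-sided scan, with plain arrays standing in for btree cursors:

	/* Walk outward from 'parent' over per-chunk free counts, at most
	 * 'budget' steps; return the first chunk found with free inodes,
	 * or -1 when the budget is spent or both ends are reached. */
	static int example_bounded_scan(const int *freecount, int nchunks,
					int parent, int budget)
	{
		int left = parent, right = parent + 1;

		while (left >= 0 || right < nchunks) {
			if (budget-- <= 0)
				return -1;	/* save frontier, fall back */
			if (left >= 0 && freecount[left] > 0)
				return left;
			if (right < nchunks && freecount[right] > 0)
				return right;
			left--;
			right++;
		}
		return -1;			/* ran off both ends */
	}
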
@@ -945,33 +1000,19 @@ nextag:
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 1000 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1001 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 1002 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 1003 error = xfs_inobt_update(cur, &rec);
949 rec.ir_free))) 1004 if (error)
950 goto error0; 1005 goto error0;
951 be32_add_cpu(&agi->agi_freecount, -1); 1006 be32_add_cpu(&agi->agi_freecount, -1);
952 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1007 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
953 down_read(&mp->m_peraglock); 1008 down_read(&mp->m_peraglock);
954 mp->m_perag[tagno].pagi_freecount--; 1009 mp->m_perag[tagno].pagi_freecount--;
955 up_read(&mp->m_peraglock); 1010 up_read(&mp->m_peraglock);
956#ifdef DEBUG
957 if (cur->bc_nlevels == 1) {
958 int freecount = 0;
959 1011
960 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1012 error = xfs_check_agi_freecount(cur, agi);
961 goto error0; 1013 if (error)
962 do { 1014 goto error0;
963 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1015
964 &rec.ir_freecount, &rec.ir_free, &i)))
965 goto error0;
966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
967 freecount += rec.ir_freecount;
968 if ((error = xfs_btree_increment(cur, 0, &i)))
969 goto error0;
970 } while (i == 1);
971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
972 XFS_FORCED_SHUTDOWN(mp));
973 }
974#endif
975 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1016 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
976 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1017 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
977 *inop = ino; 1018 *inop = ino;
@@ -1062,38 +1103,23 @@ xfs_difree(
1062 * Initialize the cursor. 1103 * Initialize the cursor.
1063 */ 1104 */
1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1105 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1065#ifdef DEBUG
1066 if (cur->bc_nlevels == 1) {
1067 int freecount = 0;
1068 1106
1069 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1107 error = xfs_check_agi_freecount(cur, agi);
1070 goto error0; 1108 if (error)
1071 do { 1109 goto error0;
1072 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1110
1073 &rec.ir_freecount, &rec.ir_free, &i)))
1074 goto error0;
1075 if (i) {
1076 freecount += rec.ir_freecount;
1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1078 goto error0;
1079 }
1080 } while (i == 1);
1081 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1082 XFS_FORCED_SHUTDOWN(mp));
1083 }
1084#endif
1085 /* 1111 /*
1086 * Look for the entry describing this inode. 1112 * Look for the entry describing this inode.
1087 */ 1113 */
1088 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1114 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1089 cmn_err(CE_WARN, 1115 cmn_err(CE_WARN,
1090 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", 1116 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
1091 error, mp->m_fsname); 1117 error, mp->m_fsname);
1092 goto error0; 1118 goto error0;
1093 } 1119 }
1094 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1120 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1095 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, 1121 error = xfs_inobt_get_rec(cur, &rec, &i);
1096 &rec.ir_free, &i))) { 1122 if (error) {
1097 cmn_err(CE_WARN, 1123 cmn_err(CE_WARN,
1098 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1124 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1099 error, mp->m_fsname); 1125 error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
1148 } else { 1174 } else {
1149 *delete = 0; 1175 *delete = 0;
1150 1176
1151 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { 1177 error = xfs_inobt_update(cur, &rec);
1178 if (error) {
1152 cmn_err(CE_WARN, 1179 cmn_err(CE_WARN,
1153 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", 1180 "xfs_difree: xfs_inobt_update returned an error %d on %s.",
1154 error, mp->m_fsname); 1181 error, mp->m_fsname);
1155 goto error0; 1182 goto error0;
1156 } 1183 }
1184
1157 /* 1185 /*
1158 * Change the inode free counts and log the ag/sb changes. 1186 * Change the inode free counts and log the ag/sb changes.
1159 */ 1187 */
@@ -1165,28 +1193,10 @@ xfs_difree(
1165 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1193 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1166 } 1194 }
1167 1195
1168#ifdef DEBUG 1196 error = xfs_check_agi_freecount(cur, agi);
1169 if (cur->bc_nlevels == 1) { 1197 if (error)
1170 int freecount = 0; 1198 goto error0;
1171 1199
1172 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1173 goto error0;
1174 do {
1175 if ((error = xfs_inobt_get_rec(cur,
1176 &rec.ir_startino,
1177 &rec.ir_freecount,
1178 &rec.ir_free, &i)))
1179 goto error0;
1180 if (i) {
1181 freecount += rec.ir_freecount;
1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1183 goto error0;
1184 }
1185 } while (i == 1);
1186 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1187 XFS_FORCED_SHUTDOWN(mp));
1188 }
1189#endif
1190 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1200 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1191 return 0; 1201 return 0;
1192 1202
@@ -1297,9 +1307,7 @@ xfs_imap(
1297 chunk_agbno = agbno - offset_agbno; 1307 chunk_agbno = agbno - offset_agbno;
1298 } else { 1308 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */ 1309 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */ 1310 xfs_inobt_rec_incore_t chunk_rec;
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */ 1311 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */ 1312 int i; /* temp state */
1305 1313
@@ -1315,15 +1323,14 @@ xfs_imap(
1315 } 1323 }
1316 1324
1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1325 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); 1326 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1319 if (error) { 1327 if (error) {
1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1321 "xfs_inobt_lookup_le() failed"); 1329 "xfs_inobt_lookup() failed");
1322 goto error0; 1330 goto error0;
1323 } 1331 }
1324 1332
1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1333 error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
1326 &chunk_free, &i);
1327 if (error) { 1334 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1335 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1329 "xfs_inobt_get_rec() failed"); 1336 "xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1348 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1342 if (error) 1349 if (error)
1343 return error; 1350 return error;
1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1351 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
1345 offset_agbno = agbno - chunk_agbno; 1352 offset_agbno = agbno - chunk_agbno;
1346 } 1353 }
1347 1354
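
Throughout this file the record accessors now traffic in a single xfs_inobt_rec_incore_t instead of three loose ino/fcnt/free parameters, keeping the endian conversion inside xfs_inobt_get_rec() and xfs_inobt_update(). Typical caller shape after the change (sketch):

	static int example_read_freecount(struct xfs_btree_cur *cur, int *out)
	{
		xfs_inobt_rec_incore_t rec;
		int stat, error;

		error = xfs_inobt_get_rec(cur, &rec, &stat);
		if (error || stat != 1)
			return error;
		*out = rec.ir_freecount;
		return 0;
	}
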
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
150 xfs_agnumber_t agno); /* allocation group number */ 150 xfs_agnumber_t agno); /* allocation group number */
151 151
152/* 152/*
153 * Lookup the first record greater than or equal to ino 153 * Lookup a record by ino in the btree given by cur.
154 * in the btree given by cur.
155 */ 154 */
156int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, 155int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
157 __int32_t fcnt, xfs_inofree_t free, int *stat); 156 xfs_lookup_t dir, int *stat);
158
159/*
160 * Lookup the first record less than or equal to ino
161 * in the btree given by cur.
162 */
163int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
164 __int32_t fcnt, xfs_inofree_t free, int *stat);
165 157
166/* 158/*
167 * Get the data from the pointed-to record. 159 * Get the data from the pointed-to record.
168 */ 160 */
169extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, 161extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
170 __int32_t *fcnt, xfs_inofree_t *free, int *stat); 162 xfs_inobt_rec_incore_t *rec, int *stat);
171 163
172#endif /* __XFS_IALLOC_H__ */ 164#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0; 83 ip->i_flags = 0;
84 ip->i_update_core = 0; 84 ip->i_update_core = 0;
85 ip->i_update_size = 0;
86 ip->i_delayed_blks = 0; 85 ip->i_delayed_blks = 0;
87 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88 ip->i_size = 0; 87 ip->i_size = 0;
@@ -456,32 +455,6 @@ out_error_or_again:
 	return error;
 }

-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t	*mp,
-		 xfs_ino_t	ino,
-		 xfs_trans_t	*tp)
-{
-	xfs_inode_t	*ip;
-	xfs_perag_t	*pag;
-
-	pag = xfs_get_perag(mp, ino);
-	read_lock(&pag->pag_ici_lock);
-	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
-
-	/* the returned inode must match the transaction */
-	if (ip && (ip->i_transp != tp))
-		return NULL;
-	return ip;
-}
-
 /*
  * Decrement reference count of an inode structure and unlock it.
  *
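
The deleted xfs_inode_incore() paired a per-AG radix-tree lookup with a check that the inode's i_transp matches the caller's transaction; with its last caller gone (see the xfs_trans_iget() hunk further down), the whole function can go. A minimal sketch of that look-up-then-validate-the-owner shape, using a flat array in place of the radix tree (hypothetical names):

#include <stddef.h>
#include <stdio.h>

struct txn { int id; };
struct inode { unsigned long ino; struct txn *owner; };

/* Return the cached inode for ino only if it is owned by tp,
 * mirroring the i_transp check in the removed function. */
static struct inode *incore(struct inode **cache, size_t n,
			    unsigned long ino, struct txn *tp)
{
	size_t i;

	for (i = 0; i < n; i++) {
		struct inode *ip = cache[i];
		if (ip && ip->ino == ino)
			return (ip->owner == tp) ? ip : NULL;
	}
	return NULL;
}

int main(void)
{
	struct txn t1 = { 1 }, t2 = { 2 };
	struct inode a = { 42, &t1 };
	struct inode *cache[] = { &a };

	printf("%p %p\n", (void *)incore(cache, 1, 42, &t1),
			  (void *)incore(cache, 1, 42, &t2));
	return 0;
}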
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
 	return 0;
 }

-void
+STATIC void
 xfs_dinode_from_disk(
 	xfs_icdinode_t	*to,
 	xfs_dinode_t	*from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
  * In that case the pages will still be in memory, but the inode size
  * will never have been updated.
  */
-xfs_fsize_t
+STATIC xfs_fsize_t
 xfs_file_last_byte(
 	xfs_inode_t	*ip)
 {
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
 /*
  * Resize an extent indirection array to new_size bytes.
  */
-void
+STATIC void
 xfs_iext_realloc_indirect(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	int		new_size)	/* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
 /*
  * Switch from indirection array to linear (direct) extent allocations.
  */
-void
+STATIC void
 xfs_iext_indirect_to_direct(
 	xfs_ifork_t	*ifp)		/* inode fork pointer */
 {
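
The four hunks above demote functions from global to STATIC, XFS's wrapper around plain static (its exact expansion varies between debug and release builds). Once the last external caller is gone, internal linkage lets the prototype disappear from the shared header, which is what the xfs_inode.h hunks below do. The pattern in miniature:

#include <stdio.h>

/* With internal linkage this helper is invisible to other translation
 * units, so its prototype can be dropped from the public header, as
 * happens above for xfs_dinode_from_disk() and friends. */
static int helper(int x)
{
	return x * 2;
}

int main(void)
{
	printf("%d\n", helper(21));
	return 0;
}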
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
 	/* Miscellaneous state. */
 	unsigned short		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
-	unsigned char		i_update_size;	/* di_size field is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */

 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
  * xfs_iget.c prototypes.
  */
-xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
-			struct xfs_trans *);
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			uint, uint, xfs_inode_t **, xfs_daddr_t);
 void		xfs_iput(xfs_inode_t *, uint);
@@ -504,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
 void		xfs_iunpin(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
 void		xfs_ichgtime(xfs_inode_t *, int);
-xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);

@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			struct xfs_buf **, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			struct xfs_inode *, xfs_daddr_t, uint);
-void		xfs_dinode_from_disk(struct xfs_icdinode *,
-			struct xfs_dinode *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 			struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void		xfs_iext_realloc_indirect(xfs_ifork_t *, int);
-void		xfs_iext_indirect_to_direct(xfs_ifork_t *);
 void		xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
 void		xfs_iext_inline_to_direct(xfs_ifork_t *, int);
 void		xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
 	}

 	/*
-	 * We don't have to worry about re-ordering here because
-	 * the update_size field is protected by the inode lock
-	 * and we have that held in exclusive mode.
-	 */
-	if (ip->i_update_size)
-		ip->i_update_size = 0;
-
-	/*
 	 * Make sure to get the latest atime from the Linux inode.
 	 */
 	xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
 	 * Clear out the fields of the inode log item particular
 	 * to the current transaction.
 	 */
-	iip->ili_ilock_recur = 0;
-	iip->ili_iolock_recur = 0;
 	iip->ili_flags = 0;

 	/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_ilock_recur;   /* lock recursion count */
-	unsigned short		ili_iolock_recur;  /* lock recursion count */
 	unsigned short		ili_flags;	   /* misc flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;

 #if XFS_BIG_INUMS
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define	XFS_INO64_OFFSET	((xfs_ino_t)(1ULL << 32))
 #else
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
 #endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"

-int
+STATIC int
 xfs_internal_inum(
 	xfs_mount_t	*mp,
 	xfs_ino_t	ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
 	int			fmterror; /* bulkstat formatter result */
-	__int32_t		gcnt;	/* current btree rec's count */
-	xfs_inofree_t		gfree;	/* current btree rec's free mask */
-	xfs_agino_t		gino;	/* current btree rec's start inode */
 	int			i;	/* loop index */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
 		 * we need to get the remainder of the chunk we're in.
 		 */
 		if (agino > 0) {
+			xfs_inobt_rec_incore_t r;
+
 			/*
 			 * Lookup the inode chunk that this inode lives in.
 			 */
-			error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+						 &tmp);
 			if (!error &&	/* no I/O error */
 			    tmp &&	/* lookup succeeded */
 					/* got the record, should always work */
-			    !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-				    &gfree, &i)) &&
+			    !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
 			    i == 1 &&
 					/* this is the right chunk */
-			    agino < gino + XFS_INODES_PER_CHUNK &&
+			    agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
 					/* lastino was not last in chunk */
-			    (chunkidx = agino - gino + 1) <
+			    (chunkidx = agino - r.ir_startino + 1) <
 				    XFS_INODES_PER_CHUNK &&
 					/* there are some left allocated */
 			    xfs_inobt_maskn(chunkidx,
-				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+					    XFS_INODES_PER_CHUNK - chunkidx) &
+					    ~r.ir_free) {
 				/*
 				 * Grab the chunk record.  Mark all the
 				 * uninteresting inodes (because they're
 				 * before our start point) free.
 				 */
 				for (i = 0; i < chunkidx; i++) {
-					if (XFS_INOBT_MASK(i) & ~gfree)
-						gcnt++;
+					if (XFS_INOBT_MASK(i) & ~r.ir_free)
+						r.ir_freecount++;
 				}
-				gfree |= xfs_inobt_maskn(0, chunkidx);
-				irbp->ir_startino = gino;
-				irbp->ir_freecount = gcnt;
-				irbp->ir_free = gfree;
+				r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
 				irbp++;
-				agino = gino + XFS_INODES_PER_CHUNK;
-				icount = XFS_INODES_PER_CHUNK - gcnt;
+				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+				icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
 			} else {
 				/*
 				 * If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
 			/*
 			 * Start of ag.  Lookup the first inode chunk.
 			 */
-			error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
 			icount = 0;
 		}
 		/*
@@ -501,6 +501,8 @@ xfs_bulkstat(
 		 * until we run out of inodes or space in the buffer.
 		 */
 		while (irbp < irbufend && icount < ubcount) {
+			xfs_inobt_rec_incore_t r;
+
 			/*
 			 * Loop as long as we're unable to read the
 			 * inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
 				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
 						be32_to_cpu(agi->agi_length))
 					break;
-				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
-						&tmp);
+				error = xfs_inobt_lookup(cur, agino,
+							 XFS_LOOKUP_GE, &tmp);
 				cond_resched();
 			}
 			/*
 			 * If ran off the end of the ag either with an error,
 			 * or the normal way, set end and stop collecting.
 			 */
-			if (error ||
-			    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-				    &gfree, &i)) ||
-			    i == 0) {
+			if (error) {
 				end_of_ag = 1;
 				break;
 			}
+
+			error = xfs_inobt_get_rec(cur, &r, &i);
+			if (error || i == 0) {
+				end_of_ag = 1;
+				break;
+			}
+
 			/*
 			 * If this chunk has any allocated inodes, save it.
 			 * Also start read-ahead now for this chunk.
 			 */
-			if (gcnt < XFS_INODES_PER_CHUNK) {
+			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
 				/*
 				 * Loop over all clusters in the next chunk.
 				 * Do a readahead if there are any allocated
 				 * inodes in that cluster.
 				 */
-				for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
-				     chunkidx = 0;
+				agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+				for (chunkidx = 0;
 				     chunkidx < XFS_INODES_PER_CHUNK;
 				     chunkidx += nicluster,
 				     agbno += nbcluster) {
-					if (xfs_inobt_maskn(chunkidx,
-					    nicluster) & ~gfree)
+					if (xfs_inobt_maskn(chunkidx, nicluster)
+							& ~r.ir_free)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster);
 				}
-				irbp->ir_startino = gino;
-				irbp->ir_freecount = gcnt;
-				irbp->ir_free = gfree;
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
 				irbp++;
-				icount += XFS_INODES_PER_CHUNK - gcnt;
+				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
 			}
 			/*
 			 * Set agino to after this chunk and bump the cursor.
 			 */
-			agino = gino + XFS_INODES_PER_CHUNK;
+			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &tmp);
 			cond_resched();
 		}
@@ -820,9 +826,7 @@ xfs_inumbers(
 	int		bufidx;
 	xfs_btree_cur_t	*cur;
 	int		error;
-	__int32_t	gcnt;
-	xfs_inofree_t	gfree;
-	xfs_agino_t	gino;
+	xfs_inobt_rec_incore_t r;
 	int		i;
 	xfs_ino_t	ino;
 	int		left;
@@ -855,7 +859,8 @@ xfs_inumbers(
 			continue;
 		}
 		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
-		error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+					 &tmp);
 		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
 				continue;
 			}
 		}
-		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
-			&i)) ||
-		    i == 0) {
+		error = xfs_inobt_get_rec(cur, &r, &i);
+		if (error || i == 0) {
 			xfs_buf_relse(agbp);
 			agbp = NULL;
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
 			agino = 0;
 			continue;
 		}
-		agino = gino + XFS_INODES_PER_CHUNK - 1;
-		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
-		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
-		buffer[bufidx].xi_allocmask = ~gfree;
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino =
+			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+		buffer[bufidx].xi_alloccount =
+			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_allocmask = ~r.ir_free;
 		bufidx++;
 		left--;
 		if (bufidx == bcount) {
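
Throughout this bulkstat/inumbers rework the three loose variables gino/gcnt/gfree become one xfs_inobt_rec_incore_t, but the bitmask arithmetic is unchanged: a record covers a 64-inode chunk, ir_free carries one bit per free inode, so "allocated inodes at or after index i" is maskn(i, 64 - i) & ~ir_free. A self-contained model of that computation (names are illustrative; XFS_INODES_PER_CHUNK really is 64):

#include <inttypes.h>
#include <stdio.h>

#define INODES_PER_CHUNK 64

/* n consecutive 1-bits starting at bit i, like xfs_inobt_maskn(). */
static uint64_t maskn(int i, int n)
{
	uint64_t m = (n >= 64) ? ~0ULL : ((1ULL << n) - 1);
	return m << i;
}

int main(void)
{
	uint64_t ir_free = ~0ULL << 8;	/* only inodes 0..7 allocated */
	int chunkidx = 4;		/* resume after inode 3 */

	/* Any allocated inodes at or after chunkidx? (inodes 4..7 here) */
	uint64_t rest = maskn(chunkidx, INODES_PER_CHUNK - chunkidx) & ~ir_free;
	printf("allocated-from-%d mask: %#" PRIx64 "\n", chunkidx, rest);
	return 0;
}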
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
 	void		*dibuff,
 	int		*stat);

-int
-xfs_internal_inum(
-	xfs_mount_t	*mp,
-	xfs_ino_t	ino);
-
 typedef int (*inumbers_fmt_pf)(
 	void			__user *ubuffer, /* buffer to write to */
 	const xfs_inogrp_t	*buffer,	/* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
 extern int	 xlog_recover(xlog_t *log);
 extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void	 xlog_recover_process_iunlinks(xlog_t *log);
-
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	 xlog_put_bp(struct xfs_buf *);

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
  * freeing of the inode and its removal from the list must be
  * atomic.
  */
-void
+STATIC void
 xlog_recover_process_iunlinks(
 	xlog_t	*log)
 {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
  *
  * The m_sb_lock must be held when this routine is called.
  */
-int
+STATIC int
 xfs_mod_incore_sb_unlocked(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {

 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
 extern int	xfs_mountfs(xfs_mount_t *mp);
-extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);

 extern void	xfs_unmountfs(xfs_mount_t *);
 extern int	xfs_unmountfs_writesb(xfs_mount_t *);
 extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
-			int64_t, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
 extern int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
 }

 /*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek().  If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
-	xfs_mru_cache_t	*mru,
-	unsigned long	key)
-{
-	xfs_mru_cache_elem_t *elem;
-
-	ASSERT(mru && mru->lists);
-	if (!mru || !mru->lists)
-		return NULL;
-
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
-	if (!elem)
-		spin_unlock(&mru->lock);
-	else
-		__release(mru_lock); /* help sparse not be stupid */
-
-	return elem ? elem->value : NULL;
-}
-
-/*
  * To release the internal data structure spinlock after having performed an
  * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
  * with the data store pointer.
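
xfs_mru_cache_peek() goes away because nothing calls it any more; the deleted comment documents the semantics it offered: like xfs_mru_cache_lookup(), but without touching the element's position in the recency lists. A toy MRU array showing lookup-promotes versus peek-does-not (not the radix-tree-backed kernel structure):

#include <stdio.h>

#define N 4

/* order[0] is most recently used; values double as keys. */
static int order[N] = { 1, 2, 3, 4 };

/* peek: find the key but leave the recency order alone. */
static int peek(int key)
{
	for (int i = 0; i < N; i++)
		if (order[i] == key)
			return i;
	return -1;
}

/* lookup: find the key and promote it to the front. */
static int lookup(int key)
{
	int i = peek(key);
	if (i > 0) {
		int v = order[i];
		for (; i > 0; i--)
			order[i] = order[i - 1];
		order[0] = v;
		i = 0;
	}
	return i;
}

int main(void)
{
	peek(3);	/* order unchanged: 1 2 3 4 */
	lookup(3);	/* promoted:       3 1 2 4 */
	for (int i = 0; i < N; i++)
		printf("%d ", order[i]);
	printf("\n");
	return 0;
}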
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 void *	xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
 void	xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
 void	*xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void	*xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
 void	xfs_mru_cache_done(struct xfs_mru_cache *mru);

 #endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
 }

 /*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-					XFS_SWRITE_LOG_RES(mp),
-					0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
-/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
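
The removed comment spells out the old sync-write policy: for a dsync-style write with no size update, force the log to the inode's last commit LSN, or force everything if the inode is merely pinned; otherwise commit a synchronous dummy transaction so the size change is logged. A condensed model of just that decision tree (stubbed actions, hypothetical field names):

#include <stdio.h>

struct ino_state {
	int	size_dirty;	/* i_update_size analogue */
	long	last_lsn;	/* lsn of last committed txn, 0 if none */
	int	pincount;	/* unflushed commits referencing the inode */
};

/* Decide how a synchronous write must be made durable. */
static const char *sync_action(const struct ino_state *ip, int osync_is_osync)
{
	if (!osync_is_osync && !ip->size_dirty) {
		if (ip->last_lsn)
			return "force log to last_lsn";
		if (ip->pincount > 0)
			return "force whole log";
		return "nothing to do";
	}
	/* size changed (or full O_SYNC): log the inode synchronously */
	return "commit synchronous inode transaction";
}

int main(void)
{
	struct ino_state a = { 0, 1234, 0 }, b = { 1, 0, 0 };

	printf("%s\n%s\n", sync_action(&a, 0), sync_action(&b, 0));
	return 0;
}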
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 				xfs_buf_t *bp, xfs_daddr_t blkno);

-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			int flags);
-
 #endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFS		14
 #define XFS_TRANS_STRAT_WRITE		15
 #define XFS_TRANS_DIOSTRAT		16
-#define	XFS_TRANS_WRITE_SYNC		17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define	XFS_TRANS_WRITEID		18
 #define	XFS_TRANS_ADDAFORK		19
 #define	XFS_TRANS_ATTRINVAL		20
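
Transaction type numbers are written into on-disk log records, so slot 17 is retired with a comment rather than renumbering the constants that follow, which would change how existing logs are interpreted. The same defensive pattern in miniature (illustrative values):

#include <stdio.h>

/* Values are persisted, so deleted entries keep their number reserved. */
enum log_rec_type {
	REC_GROWFS	= 14,
	REC_STRAT_WRITE	= 15,
	REC_DIOSTRAT	= 16,
	/* 17 retired: never reuse, old logs may still contain it */
	REC_WRITEID	= 18,
};

int main(void)
{
	printf("%d %d\n", REC_DIOSTRAT, REC_WRITEID);	/* 16 18 */
	return 0;
}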
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
 			return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);

-		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+		if (XFS_BUF_GETERROR(bp) != 0) {
 			xfs_ioerror_alert("xfs_trans_read_buf", mp,
 					  bp, blkno);
 			error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
 			return error;
 		}
 #ifdef DEBUG
-		if (xfs_do_error && (bp != NULL)) {
+		if (xfs_do_error) {
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(


 /*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction.  If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively.  This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength.  That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa.  Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
  */
 int
 xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
 	xfs_inode_t	**ipp)
 {
 	int			error;
-	xfs_inode_t		*ip;
-
-	/*
-	 * If the transaction pointer is NULL, just call the normal
-	 * xfs_iget().
-	 */
-	if (tp == NULL)
-		return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
-	/*
-	 * If we find the inode in core with this transaction
-	 * pointer in its i_transp field, then we know we already
-	 * have it locked.  In this case we just increment the lock
-	 * recursion count and return the inode to the caller.
-	 * Assert that the inode is already locked in the mode requested
-	 * by the caller.  We cannot do lock promotions yet, so
-	 * die if someone gets this wrong.
-	 */
-	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
-		/*
-		 * Make sure that the inode lock is held EXCL and
-		 * that the io lock is never upgraded when the inode
-		 * is already a part of the transaction.
-		 */
-		ASSERT(ip->i_itemp != NULL);
-		ASSERT(lock_flags & XFS_ILOCK_EXCL);
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
-		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
-			ip->i_itemp->ili_iolock_recur++;
-		}
-		if (lock_flags & XFS_ILOCK_EXCL) {
-			ip->i_itemp->ili_ilock_recur++;
-		}
-		*ipp = ip;
-		return 0;
-	}
-
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
-	error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
-	if (error) {
-		return error;
-	}
-	ASSERT(ip != NULL);

-	xfs_trans_ijoin(tp, ip, lock_flags);
-	*ipp = ip;
-	return 0;
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	if (!error && tp)
+		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	return error;
 }

 /*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
 	ASSERT(iip->ili_flags == 0);
-	ASSERT(iip->ili_ilock_recur == 0);
-	ASSERT(iip->ili_iolock_recur == 0);

 	/*
 	 * Get a log_item_desc to point at the new item.
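
With the recursion counters gone, xfs_trans_iget() collapses to "get the inode, then join it to the transaction if one was supplied". The wrapper shape, reduced to a standalone sketch (all names hypothetical):

#include <stdio.h>

struct txn { int id; };
struct inode { unsigned long ino; };

static struct inode cached = { 42 };

static int iget(unsigned long ino, struct inode **ipp)
{
	if (ino != cached.ino)
		return -1;		/* ENOENT stand-in */
	*ipp = &cached;
	return 0;
}

static void trans_ijoin(struct txn *tp, struct inode *ip)
{
	printf("joined ino %lu to txn %d\n", ip->ino, tp->id);
}

/* The simplified pattern: one lookup path, optional transaction join. */
static int trans_iget(struct txn *tp, unsigned long ino, struct inode **ipp)
{
	int error = iget(ino, ipp);

	if (!error && tp)
		trans_ijoin(tp, *ipp);
	return error;
}

int main(void)
{
	struct txn t = { 7 };
	struct inode *ip;

	return trans_iget(&t, 42, &ip);
}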
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;

 	xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);

-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The vnode might be clean but we still might need to force the
+	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);

-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit.  That means either nothing got
@@ -718,7 +713,7 @@ xfs_fsync(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-int
+STATIC int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
@@ -1476,8 +1471,8 @@ xfs_create(
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(dp);
-		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error == ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
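
The xfs_create() hunk keeps the existing ENOSPC ladder, now parameterised by log_res/log_count: flush outstanding delalloc blocks and retry the full reservation, then fall back to a "no-allocation" reservation. That retry-with-fallback shape as a standalone sketch (hypothetical names; the fake reservation succeeds only for the no-allocation case):

#include <stdio.h>

#define ENOSPC 28

static int attempts;

/* Pretend reservation that succeeds only with resblks == 0 after the
 * first flush, mimicking a nearly full filesystem. */
static int trans_reserve(int resblks)
{
	attempts++;
	return (resblks == 0 && attempts > 1) ? 0 : ENOSPC;
}

static void flush_inodes(void) { printf("flushing delalloc blocks\n"); }

int main(void)
{
	int resblks = 16;
	int error = trans_reserve(resblks);

	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		flush_inodes();
		error = trans_reserve(resblks);
	}
	if (error == ENOSPC) {
		/* no space at all: try a "no-allocation" reservation */
		resblks = 0;
		error = trans_reserve(resblks);
	}
	printf("error=%d resblks=%d attempts=%d\n", error, resblks, attempts);
	return error;
}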