Linux-2.6.12-rc2 v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jfs
44 files changed, 33077 insertions, 0 deletions
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
new file mode 100644
index 000000000000..6f1e0e95587a
--- /dev/null
+++ b/fs/jfs/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Linux JFS filesystem routines.
+#
+obj-$(CONFIG_JFS_FS) += jfs.o
+jfs-y    := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
+            jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
+            jfs_unicode.o jfs_dtree.o jfs_inode.o \
+            jfs_extent.o symlink.o jfs_metapage.o \
+            jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o resize.o xattr.o
+jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
+EXTRA_CFLAGS += -D_JFS_4K
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
new file mode 100644
index 000000000000..8d2a9ab981d4
--- /dev/null
+++ b/fs/jfs/acl.c
@@ -0,0 +1,234 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2002-2004
+ *   Copyright (C) Andreas Gruenbacher, 2001
+ *   Copyright (C) Linus Torvalds, 1991, 1992
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
+{       /*
+         * Return the POSIX ACL of the requested type for inode.
+         *
+         * type selects both the extended-attribute name and the cache slot
+         * in the JFS in-core inode (i_acl or i_default_acl); any other
+         * value yields ERR_PTR(-EINVAL).  If the slot is not
+         * JFS_ACL_NOT_CACHED its content is returned through
+         * posix_acl_dup().  Otherwise __jfs_getxattr() is called twice:
+         * first with a NULL buffer of size 0 (presumably a size query --
+         * confirm in xattr.c), then with a kmalloc'd buffer of that size.
+         * The value is decoded by posix_acl_from_xattr() and, when that
+         * does not fail, a posix_acl_dup() of the result is cached.
+         *
+         * Returns the ACL (may be NULL), NULL when the lookup reports
+         * -ENODATA (NULL is cached in that case too), or an ERR_PTR()
+         * value on failure.
+         */
+        struct posix_acl *acl;
+        char *ea_name;
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        struct posix_acl **p_acl;
+        int size;
+        char *value = NULL;
+        switch(type) {
+                case ACL_TYPE_ACCESS:
+                        ea_name = XATTR_NAME_ACL_ACCESS;
+                        p_acl = &ji->i_acl;
+                        break;
+                case ACL_TYPE_DEFAULT:
+                        ea_name = XATTR_NAME_ACL_DEFAULT;
+                        p_acl = &ji->i_default_acl;
+                        break;
+                default:
+                        return ERR_PTR(-EINVAL);
+        }
+        if (*p_acl != JFS_ACL_NOT_CACHED)       /* cache hit */
+                return posix_acl_dup(*p_acl);
+        size = __jfs_getxattr(inode, ea_name, NULL, 0);
+        if (size > 0) {
+                value = kmalloc(size, GFP_KERNEL);
+                if (!value)
+                        return ERR_PTR(-ENOMEM);
+                size = __jfs_getxattr(inode, ea_name, value, size);
+        }
+        if (size < 0) {
+                if (size == -ENODATA) {
+                        *p_acl = NULL;  /* remember that there is no ACL */
+                        acl = NULL;
+                } else
+                        acl = ERR_PTR(size);
+        } else {
+                /* reached with value == NULL when the first call returned 0 */
+                acl = posix_acl_from_xattr(value, size);
+                if (!IS_ERR(acl))
+                        *p_acl = posix_acl_dup(acl);
+        }
+        if (value)
+                kfree(value);
+        return acl;
+}
+static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{       /*
+         * Store acl as the ACL of the given type on inode.
+         *
+         * Symbolic links are rejected with -EOPNOTSUPP and an unknown type
+         * with -EINVAL.  A default ACL on a non-directory returns -EACCES
+         * when acl is set and 0 otherwise, without touching the inode.
+         * A non-NULL acl is serialised with posix_acl_to_xattr() into a
+         * kmalloc'd buffer; a NULL acl results in __jfs_setxattr() being
+         * called with a NULL value and size 0 (presumably removing the
+         * attribute -- confirm in xattr.c).
+         *
+         * On success the previously cached ACL, if any, is released and
+         * the cache slot is set to posix_acl_dup(acl).
+         *
+         * Returns 0 on success or a negative errno.
+         */
+        char *ea_name;
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        struct posix_acl **p_acl;
+        int rc;
+        int size = 0;
+        char *value = NULL;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        switch(type) {
+                case ACL_TYPE_ACCESS:
+                        ea_name = XATTR_NAME_ACL_ACCESS;
+                        p_acl = &ji->i_acl;
+                        break;
+                case ACL_TYPE_DEFAULT:
+                        ea_name = XATTR_NAME_ACL_DEFAULT;
+                        p_acl = &ji->i_default_acl;
+                        if (!S_ISDIR(inode->i_mode))
+                                return acl ? -EACCES : 0;
+                        break;
+                default:
+                        return -EINVAL;
+        }
+        if (acl) {
+                size = xattr_acl_size(acl->a_count);
+                value = kmalloc(size, GFP_KERNEL);
+                if (!value)
+                        return -ENOMEM;
+                rc = posix_acl_to_xattr(acl, value, size);
+                if (rc < 0)
+                        goto out;
+        }
+        rc = __jfs_setxattr(inode, ea_name, value, size, 0);
+out:
+        if (value)
+                kfree(value);
+        if (!rc) {      /* attribute written: refresh the in-core cache */
+                if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED))
+                        posix_acl_release(*p_acl);
+                *p_acl = posix_acl_dup(acl);
+        }
+        return rc;
+}
+static int jfs_check_acl(struct inode *inode, int mask)
+{       /*
+         * ACL callback handed to generic_permission() by jfs_permission().
+         *
+         * If the access ACL is not cached yet it is loaded through
+         * jfs_get_acl(), which also fills ji->i_acl; a load error is
+         * returned as is.  A non-NULL cached ACL is evaluated with
+         * posix_acl_permission(); with no access ACL the result is
+         * -EAGAIN (presumably telling the caller to fall back to the mode
+         * bits -- confirm against generic_permission()).
+         */
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        if (ji->i_acl == JFS_ACL_NOT_CACHED) {
+                struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                posix_acl_release(acl); /* only ji->i_acl is used below */
+        }
+        if (ji->i_acl)
+                return posix_acl_permission(inode, ji->i_acl, mask);
+        return -EAGAIN;
+}
+int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{       /*
+         * Permission check for JFS inodes: delegates to
+         * generic_permission() with jfs_check_acl as the ACL callback.
+         * nd is not used.
+         */
+        return generic_permission(inode, mask, jfs_check_acl);
+}
+int jfs_init_acl(struct inode *inode, struct inode *dir)
+{       /*
+         * Set up the ACLs and mode of inode from the default ACL of dir.
+         *
+         * Symbolic links are left alone (returns 0).  If dir has no
+         * default ACL the current process umask is cleared from
+         * inode->i_mode.  Otherwise:
+         *   - when inode is a directory, the default ACL of dir is also
+         *     stored as its own default ACL;
+         *   - a clone of the ACL is passed with the mode to
+         *     posix_acl_create_masq(); a non-negative result stores the
+         *     updated mode, and a positive result additionally stores the
+         *     clone as the access ACL.
+         *
+         * Returns 0 on success or a negative errno.
+         */
+        struct posix_acl *acl = NULL;
+        struct posix_acl *clone;
+        mode_t mode;
+        int rc = 0;
+        if (S_ISLNK(inode->i_mode))
+                return 0;
+        acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (acl) {
+                if (S_ISDIR(inode->i_mode)) {
+                        rc = jfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+                        if (rc)
+                                goto cleanup;
+                }
+                clone = posix_acl_clone(acl, GFP_KERNEL);
+                if (!clone) {
+                        rc = -ENOMEM;
+                        goto cleanup;
+                }
+                mode = inode->i_mode;
+                rc = posix_acl_create_masq(clone, &mode);
+                if (rc >= 0) {
+                        inode->i_mode = mode;
+                        if (rc > 0)
+                                rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+                }
+                posix_acl_release(clone);
+cleanup:
+                posix_acl_release(acl); /* drop the reference from jfs_get_acl() */
+        } else
+                inode->i_mode &= ~current->fs->umask;
+        return rc;
+}
+static int jfs_acl_chmod(struct inode *inode)
+{       /*
+         * Bring the access ACL of inode in line with inode->i_mode; called
+         * from jfs_setattr() after a mode change.
+         *
+         * Symbolic links return -EOPNOTSUPP.  The current access ACL is
+         * cloned, the clone is adjusted by posix_acl_chmod_masq() and, if
+         * that succeeds, written back with jfs_set_acl().
+         *
+         * Returns 0 on success (including when there is no access ACL) or
+         * a negative errno.
+         */
+        struct posix_acl *acl, *clone;
+        int rc;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl) || !acl)
+                return PTR_ERR(acl);    /* PTR_ERR(NULL) is 0: no ACL, nothing to do */
+        clone = posix_acl_clone(acl, GFP_KERNEL);
+        posix_acl_release(acl);
+        if (!clone)
+                return -ENOMEM;
+        rc = posix_acl_chmod_masq(clone, inode->i_mode);
+        if (!rc)
+                rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+        posix_acl_release(clone);
+        return rc;
+}
+int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{       /*
+         * Apply the attribute changes in iattr to the inode of dentry.
+         *
+         * The request is first validated with inode_change_ok().  If it
+         * changes the owner or group to a different value,
+         * DQUOT_TRANSFER() is called and a failure is reported as -EDQUOT.
+         * inode_setattr() then applies the change; when it succeeds and
+         * ATTR_MODE was requested, jfs_acl_chmod() updates the access ACL.
+         *
+         * Returns 0 on success or a negative errno.
+         */
+        struct inode *inode = dentry->d_inode;
+        int rc;
+        rc = inode_change_ok(inode, iattr);
+        if (rc)
+                return rc;
+        if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
+            (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+                if (DQUOT_TRANSFER(inode, iattr))
+                        return -EDQUOT;
+        }
+        rc = inode_setattr(inode, iattr);
+        if (!rc && (iattr->ia_valid & ATTR_MODE))
+                rc = jfs_acl_chmod(inode);
+        return rc;
+}
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
new file mode 100644
index 000000000000..ab7cd0567c95
--- /dev/null
+++ b/fs/jfs/endian24.h
@@ -0,0 +1,49 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_ENDIAN24
+#define _H_ENDIAN24
+/*
+ *      endian24.h:
+ *
+ * Endian conversion for 24-bit data
+ *
+ * __swab24(x) exchanges the lowest byte (bits 0-7) of x with its third
+ * byte (bits 16-23) and keeps the middle byte (bits 8-15) in place.
+ * Bits 24-31 of the result are always zero.  The argument is evaluated
+ * once, into a local __u32.
+ */
+#define __swab24(x) \
+({ \
+        __u32 __x = (x); \
+        ((__u32)( \
+                ((__x & (__u32)0x000000ffUL) << 16) | \
+                 (__x & (__u32)0x0000ff00UL)        | \
+                ((__x & (__u32)0x00ff0000UL) >> 16) )); \
+})
+#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
+        #define __cpu_to_le24(x) ((__u32)(x))  /* little-endian: plain cast */
+        #define __le24_to_cpu(x) ((__u32)(x))
+#else   /* not detected as little-endian: swap the outer bytes */
+        #define __cpu_to_le24(x) __swab24(x)
+        #define __le24_to_cpu(x) __swab24(x)
+#endif
+#ifdef __KERNEL__       /* unprefixed names are defined for kernel builds only */
+        #define cpu_to_le24 __cpu_to_le24
+        #define le24_to_cpu __le24_to_cpu
+#endif
+#endif                          /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
new file mode 100644
index 000000000000..a87b06fa8ff8
--- /dev/null
+++ b/fs/jfs/file.c
@@ -0,0 +1,119 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_dmap.h"
+#include "jfs_txnmgr.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+#include "jfs_debug.h"
+extern int jfs_commit_inode(struct inode *, int);
+extern void jfs_truncate(struct inode *);
+int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{       /*
+         * fsync entry point for JFS files (see jfs_file_operations).
+         *
+         * When the inode has no I_DIRTY bits set, or when datasync is
+         * requested and I_DIRTY_DATASYNC is clear, only the journal is
+         * flushed and 0 is returned.  Otherwise the inode is committed
+         * with jfs_commit_inode(inode, 1) and any non-zero result is
+         * reported as -EIO.  file is not used.
+         */
+        struct inode *inode = dentry->d_inode;
+        int rc = 0;
+        if (!(inode->i_state & I_DIRTY) ||
+            (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
+                /* Make sure committed changes hit the disk */
+                jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
+                return rc;
+        }
+        rc |= jfs_commit_inode(inode, 1);
+        return rc ? -EIO : 0;
+}
+static int jfs_open(struct inode *inode, struct file *file)
+{       /*
+         * Open handler: runs generic_file_open() and, for an empty regular
+         * file opened for writing, marks the inode's allocation group as
+         * active.  Returns the generic_file_open() error, or 0.
+         */
+        int rc;
+        if ((rc = generic_file_open(inode, file)))
+                return rc;
+        /*
+         * We attempt to allow only one "active" file open per aggregate
+         * group.  Otherwise, appending to files in parallel can cause
+         * fragmentation within the files.
+         *
+         * If the file is empty, it was probably just created and going
+         * to be written to.  If it has a size, we'll hold off until the
+         * file is actually grown.
+         */
+        if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE &&
+            (inode->i_size == 0)) {
+                struct jfs_inode_info *ji = JFS_IP(inode);
+                spin_lock_irq(&ji->ag_lock);
+                if (ji->active_ag == -1) {      /* -1: no group marked active yet */
+                        ji->active_ag = ji->agno;
+                        atomic_inc(
+                            &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]);
+                }
+                spin_unlock_irq(&ji->ag_lock);
+        }
+        return 0;
+}
+static int jfs_release(struct inode *inode, struct file *file)
+{       /*
+         * Release handler: if the inode has an active allocation group
+         * recorded (active_ag != -1), decrement that group's db_active
+         * counter and reset active_ag to -1, all under ag_lock.
+         * Always returns 0; file is not used.
+         */
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        spin_lock_irq(&ji->ag_lock);
+        if (ji->active_ag != -1) {
+                struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
+                atomic_dec(&bmap->db_active[ji->active_ag]);
+                ji->active_ag = -1;
+        }
+        spin_unlock_irq(&ji->ag_lock);
+        return 0;
+}
+struct inode_operations jfs_file_inode_operations = {   /* inode ops for JFS files */
+        .truncate       = jfs_truncate,
+        .setxattr       = jfs_setxattr,
+        .getxattr       = jfs_getxattr,
+        .listxattr      = jfs_listxattr,
+        .removexattr    = jfs_removexattr,
+#ifdef CONFIG_JFS_POSIX_ACL     /* ACL-aware hooks only when ACL support is built */
+        .setattr        = jfs_setattr,
+        .permission     = jfs_permission,
+#endif
+};
+struct file_operations jfs_file_operations = {  /* file ops for JFS files */
+        .open           = jfs_open,
+        .llseek         = generic_file_llseek,
+        .write          = generic_file_write,
+        .read           = generic_file_read,
+        .aio_read       = generic_file_aio_read,
+        .aio_write      = generic_file_aio_write,
+        .mmap           = generic_file_mmap,
+        .readv          = generic_file_readv,
+        .writev         = generic_file_writev,
+        .sendfile       = generic_file_sendfile,
+        .fsync          = jfs_fsync,
+        .release        = jfs_release,
+};      /* only open, fsync and release are JFS-specific; the rest are generic_file_* */
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
new file mode 100644
index 000000000000..7bc906677b0d
--- /dev/null
+++ b/fs/jfs/inode.c
@@ -0,0 +1,384 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/mpage.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_imap.h"
+#include "jfs_extent.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+extern struct inode_operations jfs_dir_inode_operations;
+extern struct inode_operations jfs_file_inode_operations;
+extern struct inode_operations jfs_symlink_inode_operations;
+extern struct file_operations jfs_dir_operations;
+extern struct file_operations jfs_file_operations;
+struct address_space_operations jfs_aops;
+extern int freeZeroLink(struct inode *);
+void jfs_read_inode(struct inode *inode)
+{       /*
+         * Load inode through diRead() and install the operation tables
+         * that match its type.  If diRead() fails the inode is marked bad.
+         *
+         *   regular file - jfs file inode/file operations and jfs_aops
+         *   directory    - jfs directory inode/file operations
+         *   symlink      - page_symlink_inode_operations with jfs_aops
+         *                  when i_size >= IDATASIZE, otherwise
+         *                  jfs_symlink_inode_operations (presumably the
+         *                  target is then kept inside the inode --
+         *                  confirm in symlink.c)
+         *   anything else - jfs file inode operations plus
+         *                  init_special_inode()
+         */
+        if (diRead(inode)) { 
+                make_bad_inode(inode);
+                return;
+        }
+        if (S_ISREG(inode->i_mode)) {
+                inode->i_op = &jfs_file_inode_operations;
+                inode->i_fop = &jfs_file_operations;
+                inode->i_mapping->a_ops = &jfs_aops;
+        } else if (S_ISDIR(inode->i_mode)) {
+                inode->i_op = &jfs_dir_inode_operations;
+                inode->i_fop = &jfs_dir_operations;
+        } else if (S_ISLNK(inode->i_mode)) {
+                if (inode->i_size >= IDATASIZE) {
+                        inode->i_op = &page_symlink_inode_operations;
+                        inode->i_mapping->a_ops = &jfs_aops;
+                } else
+                        inode->i_op = &jfs_symlink_inode_operations;
+        } else {
+                inode->i_op = &jfs_file_inode_operations;
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+        }
+}
+/*
+ * Workhorse of both fsync & write_inode
+ *
+ * Commits inode in a transaction of its own.  Nothing is done (and 0 is
+ * returned) when the inode has no links left, when COMMIT_Dirty is not
+ * set, or when the volume is read-only; the read-only case is logged at
+ * most five times for non-special files.  A non-zero wait selects
+ * COMMIT_SYNC for txCommit().
+ *
+ * Returns 0 or the result of txCommit().
+ */
+int jfs_commit_inode(struct inode *inode, int wait)
+{
+        int rc = 0;
+        tid_t tid;
+        static int noisy = 5;   /* remaining "read-only volume" messages */
+        jfs_info("In jfs_commit_inode, inode = 0x%p", inode);
+        /*
+         * Don't commit if inode has been committed since last being
+         * marked dirty, or if it has been deleted.
+         */
+        if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode))
+                return 0;
+        if (isReadOnly(inode)) {
+                /* kernel allows writes to devices on read-only
+                 * partitions and may think inode is dirty
+                 */
+                if (!special_file(inode->i_mode) && noisy) {
+                        jfs_err("jfs_commit_inode(0x%p) called on "
+                                   "read-only volume", inode);
+                        jfs_err("Is remount racy?");
+                        noisy--;
+                }
+                return 0;
+        }
+        /* transaction is started before commit_sem is taken */
+        tid = txBegin(inode->i_sb, COMMIT_INODE);
+        down(&JFS_IP(inode)->commit_sem);
+        /*
+         * Retest inode state after taking commit_sem
+         */
+        if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode))
+                rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0);
+        txEnd(tid);
+        up(&JFS_IP(inode)->commit_sem);
+        return rc;
+}
+int jfs_write_inode(struct inode *inode, int wait)
+{       /*
+         * write_inode handler.  Does nothing for inodes flagged
+         * COMMIT_Nolink.  Returns 0, or -EIO when jfs_commit_inode()
+         * reports a failure.
+         */
+        if (test_cflag(COMMIT_Nolink, inode))
+                return 0;
+        /*
+         * If COMMIT_Dirty is not set, the inode isn't really dirty.
+         * It has been committed since the last change, but was still
+         * on the dirty inode list.
+         */
+         if (!test_cflag(COMMIT_Dirty, inode)) {
+                /* Make sure committed changes hit the disk */
+                jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait);
+                return 0;
+         }
+        if (jfs_commit_inode(inode, wait)) {
+                jfs_err("jfs_write_inode: jfs_commit_inode failed!");
+                return -EIO;
+        } else
+                return 0;
+}
+void jfs_delete_inode(struct inode *inode)
+{       /*
+         * delete_inode handler: calls freeZeroLink() first when
+         * COMMIT_Freewmap is set, frees the on-disk inode with diFree(),
+         * gives the inode back to the quota system and finally calls
+         * clear_inode().  The return values of freeZeroLink() and
+         * diFree() are not checked.
+         */
+        jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
+        if (test_cflag(COMMIT_Freewmap, inode))
+                freeZeroLink(inode);
+        diFree(inode);
+        /*
+         * Free the inode from the quota allocation.
+         */
+        DQUOT_INIT(inode);
+        DQUOT_FREE_INODE(inode);
+        DQUOT_DROP(inode);
+        clear_inode(inode);
+}
+void jfs_dirty_inode(struct inode *inode)
+{       /*
+         * dirty_inode handler: sets COMMIT_Dirty on inode unless the
+         * volume is read-only.  In the read-only case nothing is flagged
+         * and, for non-special files, an error is logged at most five
+         * times.
+         */
+        static int noisy = 5;   /* remaining "read-only volume" messages */
+        if (isReadOnly(inode)) {
+                if (!special_file(inode->i_mode) && noisy) {
+                        /* kernel allows writes to devices on read-only
+                         * partitions and may try to mark inode dirty
+                         */
+                        jfs_err("jfs_dirty_inode called on read-only volume");
+                        jfs_err("Is remount racy?");
+                        noisy--;
+                }
+                return;
+        }
+        set_cflag(COMMIT_Dirty, inode);
+}
+static int
+jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks,
+                        struct buffer_head *bh_result, int create)
+{       /*
+         * Map up to max_blocks logical blocks of ip, starting at lblock,
+         * into bh_result.
+         *
+         * Existing extent: when lblock lies below i_size and xtLookup()
+         * finds a non-empty extent, bh_result is mapped to it.  An extent
+         * flagged XAD_NOTRECORDED is left unmapped for reads; for
+         * create != 0 it is first recorded with extRecord() and the
+         * buffer is marked new.
+         *
+         * No extent: nothing is mapped for reads.  For create != 0 a new
+         * extent is allocated through extHint() and extAlloc(), the
+         * buffer is marked new and mapped.
+         *
+         * The inode is write-locked when create is set and read-locked
+         * otherwise, except for inodes whose fileset is AGGREGATE_I.
+         * Both allocation paths are compiled only with _JFS_4K; without
+         * it they hit BUG().
+         *
+         * Returns 0, or the error from extRecord(), extHint() or
+         * extAlloc().
+         */
+        s64 lblock64 = lblock;
+        int rc = 0;
+        int take_locks;
+        xad_t xad;
+        s64 xaddr;
+        int xflag;
+        s32 xlen;
+        /*
+         * If this is a special inode (imap, dmap)
+         * the lock should already be taken
+         */
+        take_locks = (JFS_IP(ip)->fileset != AGGREGATE_I);
+        /*
+         * Take appropriate lock on inode
+         */
+        if (take_locks) {
+                if (create)
+                        IWRITE_LOCK(ip);
+                else
+                        IREAD_LOCK(ip);
+        }
+        if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
+            (xtLookup(ip, lblock64, max_blocks, &xflag, &xaddr, &xlen, 0)
+             == 0) && xlen) {
+                if (xflag & XAD_NOTRECORDED) {
+                        if (!create)
+                                /*
+                                 * Allocated but not recorded, read treats
+                                 * this as a hole
+                                 */
+                                goto unlock;
+#ifdef _JFS_4K
+                        XADoffset(&xad, lblock64);
+                        XADlength(&xad, xlen);
+                        XADaddress(&xad, xaddr);
+#else                           /* _JFS_4K */
+                        /*
+                         * As long as block size = 4K, this isn't a problem.
+                         * We should mark the whole page not ABNR, but how
+                         * will we know to mark the other blocks BH_New?
+                         */
+                        BUG();
+#endif                          /* _JFS_4K */
+                        rc = extRecord(ip, &xad);
+                        if (rc)
+                                goto unlock;
+                        set_buffer_new(bh_result);
+                }
+                map_bh(bh_result, ip->i_sb, xaddr);
+                bh_result->b_size = xlen << ip->i_blkbits;
+                goto unlock;
+        }
+        if (!create)
+                goto unlock;
+        /*
+         * Allocate a new block
+         */
+#ifdef _JFS_4K
+        if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
+                goto unlock;
+        rc = extAlloc(ip, max_blocks, lblock64, &xad, FALSE);
+        if (rc)
+                goto unlock;
+        set_buffer_new(bh_result);
+        map_bh(bh_result, ip->i_sb, addressXAD(&xad));
+        bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits;
+#else                           /* _JFS_4K */
+        /*
+         * We need to do whatever it takes to keep all but the last buffers
+         * in 4K pages - see jfs_write.c
+         */
+        BUG();
+#endif                          /* _JFS_4K */
+      unlock:
+        /*
+         * Release lock on inode
+         */
+        if (take_locks) {
+                if (create)
+                        IWRITE_UNLOCK(ip);
+                else
+                        IREAD_UNLOCK(ip);
+        }
+        return rc;
+}
+static int jfs_get_block(struct inode *ip, sector_t lblock,
+                         struct buffer_head *bh_result, int create)
+{       /* Single-block form of jfs_get_blocks(): max_blocks is fixed at 1. */
+        return jfs_get_blocks(ip, lblock, 1, bh_result, create);
+}
+static int jfs_writepage(struct page *page, struct writeback_control *wbc)
+{       /* writepage: nobh_writepage() with jfs_get_block as block mapper. */
+        return nobh_writepage(page, jfs_get_block, wbc);
+}
+static int jfs_writepages(struct address_space *mapping,
+                        struct writeback_control *wbc)
+{       /* writepages: mpage_writepages() with jfs_get_block as block mapper. */
+        return mpage_writepages(mapping, wbc, jfs_get_block);
+}
+static int jfs_readpage(struct file *file, struct page *page)
+{       /* readpage: mpage_readpage() with jfs_get_block; file is unused. */
+        return mpage_readpage(page, jfs_get_block);
+}
+static int jfs_readpages(struct file *file, struct address_space *mapping,
+                struct list_head *pages, unsigned nr_pages)
+{       /* readpages: mpage_readpages() with jfs_get_block; file is unused. */
+        return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
+}
+static int jfs_prepare_write(struct file *file,
+                             struct page *page, unsigned from, unsigned to)
+{       /* prepare_write: nobh_prepare_write() with jfs_get_block; file is unused. */
+        return nobh_prepare_write(page, from, to, jfs_get_block);
+}
+static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
+{       /* bmap: generic_block_bmap() with jfs_get_block as block mapper. */
+        return generic_block_bmap(mapping, block, jfs_get_block);
+}
+static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
+        const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{       /*
+         * direct_IO: hands the request to blockdev_direct_IO() for the
+         * inode behind iocb->ki_filp, on the superblock's block device,
+         * with the multi-block mapper jfs_get_blocks and a NULL final
+         * argument.
+         */
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_mapping->host;
+        return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+                                offset, nr_segs, jfs_get_blocks, NULL);
+}
+struct address_space_operations jfs_aops = {    /* installed by jfs_read_inode() */
+        .readpage       = jfs_readpage,
+        .readpages      = jfs_readpages,
+        .writepage      = jfs_writepage,
+        .writepages     = jfs_writepages,
+        .sync_page      = block_sync_page,
+        .prepare_write  = jfs_prepare_write,
+        .commit_write   = nobh_commit_write,
+        .bmap           = jfs_bmap,
+        .direct_IO      = jfs_direct_IO,
+};
+/*
+ * Guts of jfs_truncate.  Called with locks already held.  Can be called
+ * with directory for truncating directory index table.
+ *
+ * For an inode flagged COMMIT_Nolink a single xtTruncate() call with
+ * tid 0 and COMMIT_WMAP is made, without any transaction.  Otherwise the
+ * truncation runs as a loop of transactions: each pass calls
+ * xtTruncate(), updates mtime/ctime, marks the inode dirty and commits,
+ * and the loop repeats while the size reported by xtTruncate() is still
+ * larger than length.  A negative result from xtTruncate() ends the loop
+ * without a commit.  The result of txCommit() is not checked.
+ */
+void jfs_truncate_nolock(struct inode *ip, loff_t length)
+{
+        loff_t newsize;
+        tid_t tid;
+        ASSERT(length >= 0);
+        if (test_cflag(COMMIT_Nolink, ip)) {
+                xtTruncate(0, ip, length, COMMIT_WMAP);
+                return;
+        }
+        do {
+                tid = txBegin(ip->i_sb, 0);
+                /*
+                 * The commit_sem cannot be taken before txBegin.
+                 * txBegin may block and there is a chance the inode
+                 * could be marked dirty and need to be committed
+                 * before txBegin unblocks
+                 */
+                down(&JFS_IP(ip)->commit_sem);
+                newsize = xtTruncate(tid, ip, length,
+                                     COMMIT_TRUNCATE | COMMIT_PWMAP);
+                if (newsize < 0) {
+                        txEnd(tid);
+                        up(&JFS_IP(ip)->commit_sem);
+                        break;
+                }
+                ip->i_mtime = ip->i_ctime = CURRENT_TIME;
+                mark_inode_dirty(ip);
+                txCommit(tid, 1, &ip, 0);
+                txEnd(tid);
+                up(&JFS_IP(ip)->commit_sem);
+        } while (newsize > length);     /* Truncate isn't always atomic */
+}
+void jfs_truncate(struct inode *ip)
+{       /*
+         * truncate handler: truncates the file to the size already stored
+         * in ip->i_size.  nobh_truncate_page() is called for that offset
+         * first, then jfs_truncate_nolock() runs under the inode write
+         * lock.
+         */
+        jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size);
+        nobh_truncate_page(ip->i_mapping, ip->i_size);
+        IWRITE_LOCK(ip);
+        jfs_truncate_nolock(ip, ip->i_size);
+        IWRITE_UNLOCK(ip);
+}
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
new file mode 100644
index 000000000000..d2ae430adecf
--- /dev/null
+++ b/fs/jfs/jfs_acl.h
@@ -0,0 +1,30 @@
+/*
+ *   Copyright (c) International Business Machines  Corp., 2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_ACL
+#define _H_JFS_ACL
+#ifdef CONFIG_JFS_POSIX_ACL
+#include <linux/xattr_acl.h>
+int jfs_permission(struct inode *, int, struct nameidata *);
+int jfs_init_acl(struct inode *, struct inode *);
+int jfs_setattr(struct dentry *, struct iattr *);
+#endif          /* CONFIG_JFS_POSIX_ACL */
+#endif          /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h
new file mode 100644
index 000000000000..7f3e9ac454ff
--- /dev/null
+++ b/fs/jfs/jfs_btree.h
@@ -0,0 +1,172 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_BTREE
+#define _H_JFS_BTREE
+/*
+ *      jfs_btree.h: B+-tree
+ *
+ * JFS B+-tree (dtree and xtree) common definitions
+ */
+/*
+ *      basic btree page - btpage
+ *
+struct btpage {
+        s64 next;               right sibling bn
+        s64 prev;               left sibling bn
+        u8 flag;
+        u8 rsrvd[7];            type specific
+        s64 self;               self address
+        u8 entry[4064];
+};                                              */
+/* btpage flag bits (the u8 flag field of a btree page) */
+#define BT_TYPE         0x07    /* B+-tree index: BT_ROOT | BT_LEAF | BT_INTERNAL */
+#define BT_ROOT         0x01    /* root page */
+#define BT_LEAF         0x02    /* leaf page */
+#define BT_INTERNAL     0x04    /* internal page */
+#define BT_RIGHTMOST    0x10    /* rightmost page */
+#define BT_LEFTMOST     0x20    /* leftmost page */
+#define BT_SWAPPED      0x80    /* used by fsck for endian swapping */
+/* btorder (in inode); none of these values is referenced elsewhere in this header */
+#define BT_RANDOM               0x0000
+#define BT_SEQUENTIAL           0x0001
+#define BT_LOOKUP               0x0010
+#define BT_INSERT               0x0020
+#define BT_DELETE               0x0040
+/*
+ *      btree page buffer cache access
+ *
+ * The root page of a tree is kept inside the in-memory inode
+ * (JFS_IP(IP)->ROOT) rather than in a metapage.  For block number 0
+ * BT_GETPAGE() therefore hands back, as MP, the address of the inode's
+ * bxflag field cast to struct metapage *.  BT_IS_ROOT() tells that
+ * stand-in from a real metapage by COMMIT_PAGE being clear in MP->xflag
+ * (presumably bxflag is what MP->xflag reads in that case - confirm
+ * against the jfs_inode_info and metapage layouts).
+ *
+ * BT_MARK_DIRTY() and BT_PUTPAGE() rely on that test: the root is
+ * dirtied via mark_inode_dirty() and is never released, every other
+ * page goes through mark_metapage_dirty() / release_metapage().
+ *
+ * The multi-statement macros are bare brace blocks that assign to their
+ * MP, P and RC arguments, so those arguments must be plain lvalues.
+ */
+#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0)
+/*
+ * get page from buffer page: the in-inode ROOT member when MP is the
+ * root stand-in, MP->data otherwise; the result is cast to TYPE *
+ */
+#define BT_PAGE(IP, MP, TYPE, ROOT)\
+        (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data)
+/*
+ * get the page buffer and the page for specified block address
+ *
+ * BN == 0: MP/P refer to the in-inode root and RC is 0.
+ * BN != 0: the page is read with read_metapage((IP), BN, SIZE, 1);
+ *          on success RC is 0 and P is MP->data, on failure P is NULL,
+ *          RC is -EIO and "bread failed!" is reported through jfs_err().
+ */
+#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\
+{\
+        if ((BN) == 0)\
+        {\
+                MP = (struct metapage *)&JFS_IP(IP)->bxflag;\
+                P = (TYPE *)&JFS_IP(IP)->ROOT;\
+                RC = 0;\
+        }\
+        else\
+        {\
+                MP = read_metapage((IP), BN, SIZE, 1);\
+                if (MP) {\
+                        RC = 0;\
+                        P = (MP)->data;\
+                } else {\
+                        P = NULL;\
+                        jfs_err("bread failed!");\
+                        RC = -EIO;\
+                }\
+        }\
+}
+#define BT_MARK_DIRTY(MP, IP)\
+{\
+        if (BT_IS_ROOT(MP))\
+                mark_inode_dirty(IP);\
+        else\
+                mark_metapage_dirty(MP);\
+}
+/* put the page buffer: a no-op for the root, release_metapage() otherwise */
+#define BT_PUTPAGE(MP)\
+{\
+        if (! BT_IS_ROOT(MP)) \
+                release_metapage(MP); \
+}
+/*
+ *      btree traversal stack
+ *
+ * record the path traversed during the search;
+ * top frame record the leaf page/entry selected.
+ *
+ * stack macros:
+ *   BT_CLR        - empty the stack (top = first frame).
+ *   BT_STACK_FULL - true when top points at the last frame of stack[].
+ *   BT_PUSH       - asserts the stack is not full, stores BN/INDEX in
+ *                   the frame at top, then advances top; because of the
+ *                   assertion at most MAXTREEHEIGHT-1 frames are pushed.
+ *                   It is a bare brace block, not an expression.
+ *   BT_POP        - NULL when the stack is empty, otherwise top is
+ *                   decremented and the resulting frame is returned.
+ *   BT_STACK      - NULL when the stack is empty, otherwise top itself,
+ *                   without moving it.
+ */
+struct btframe {        /* stack frame */
+        s64 bn;                 /* 8: */
+        s16 index;              /* 2: */
+        s16 lastindex;          /* 2: unused */
+        struct metapage *mp;    /* 4/8: */
+};                              /* (16/24) */
+struct btstack {
+        struct btframe *top;    /* frame the next BT_PUSH fills; == stack when empty */
+        int nsplit;             /* not touched by any macro in this header */
+        struct btframe stack[MAXTREEHEIGHT];
+};
+#define BT_CLR(btstack)\
+        (btstack)->top = (btstack)->stack
+#define BT_STACK_FULL(btstack)\
+        ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1]))
+#define BT_PUSH(BTSTACK, BN, INDEX)\
+{\
+        assert(!BT_STACK_FULL(BTSTACK));\
+        (BTSTACK)->top->bn = BN;\
+        (BTSTACK)->top->index = INDEX;\
+        ++(BTSTACK)->top;\
+}
+#define BT_POP(btstack)\
+        ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top )
+#define BT_STACK(btstack)\
+        ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top )
+/*
+ * Print every frame of a traversal stack to the kernel log.
+ *
+ * All MAXTREEHEIGHT slots are printed regardless of where top points, so
+ * slots that were never pushed show whatever they happen to contain.
+ */
+static inline void BT_STACK_DUMP(struct btstack *btstack)
+{
+        int i;
+        /* same level as the frame lines, so the heading is not filtered separately */
+        printk(KERN_ERR "btstack dump:\n");
+        for (i = 0; i < MAXTREEHEIGHT; i++)
+                printk(KERN_ERR "bn = %Lx, index = %d\n",
+                       (long long)btstack->stack[i].bn,
+                       btstack->stack[i].index);
+}
+/*
+ * retrieve search results
+ *
+ * copies bn, mp and index out of the frame LEAF into BN, MP and INDEX
+ * and sets P to the page: MP->data when BN is non-zero, the in-inode
+ * ROOT member when BN is 0.  BN, MP, P and INDEX are assigned to, so
+ * they must be lvalues.
+ */
+#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\
+{\
+        BN = (LEAF)->bn;\
+        MP = (LEAF)->mp;\
+        if (BN)\
+                P = (TYPE *)MP->data;\
+        else\
+                P = (TYPE *)&JFS_IP(IP)->ROOT;\
+        INDEX = (LEAF)->index;\
+}
+/*
+ * put the page buffer of search: releases the metapage recorded in the
+ * frame at top unless BT_IS_ROOT() says it is the root
+ */
+#define BT_PUTSEARCH(BTSTACK)\
+{\
+        if (! BT_IS_ROOT((BTSTACK)->top->mp))\
+                release_metapage((BTSTACK)->top->mp);\
+}
+#endif                          /* _H_JFS_BTREE */
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
new file mode 100644
index 000000000000..91a0a889ebc5
--- /dev/null
+++ b/fs/jfs/jfs_debug.c
@@ -0,0 +1,154 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_debug.h"
+#ifdef CONFIG_JFS_DEBUG
+/*
+ * dump_mem: hex/character dump of a memory region to the kernel log
+ *
+ *      label   - text printed in the heading line
+ *      data    - start of the region
+ *      length  - number of bytes to dump
+ *
+ * A heading is printed first.  Each following line covers up to 16 bytes:
+ * up to four 32-bit words in hex (host byte order), then the same bytes
+ * as characters, with non-printable ones shown as '.'.
+ */
+void dump_mem(char *label, void *data, int length)
+{
+        char *bytes = data;
+        char line[80];          /* worst case: 4 * 9 + 16 * 2 + 1 = 69 */
+        int i, j;
+        printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
+               data);
+        for (i = 0; i < length; i += 16) {
+                char *out = line;
+                *out = 0;
+                for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
+                        unsigned int word = 0;
+                        int left = length - (i + j * 4);
+                        /*
+                         * Copy only the bytes that lie inside the region:
+                         * a trailing partial word is zero-padded instead of
+                         * reading up to 3 bytes past the end of data, and
+                         * data need not be int-aligned.
+                         */
+                        memcpy(&word, bytes + i + j * 4, left < 4 ? left : 4);
+                        out += sprintf(out, " %08x", word);
+                }
+                for (j = 0; (j < 16) && (i + j < length); j++) {
+                        *out++ = ' ';
+                        *out++ = isprint(bytes[i + j]) ? bytes[i + j] : '.';
+                }
+                *out = 0;
+                printk("%s\n", line);
+        }
+}
+#endif
+#ifdef PROC_FS_JFS /* see jfs_debug.h */
+static struct proc_dir_entry *base;
+#ifdef CONFIG_JFS_DEBUG
+extern read_proc_t jfs_txanchor_read;
+/*
+ * read_proc handler for the "loglevel" entry.
+ *
+ * Formats jfsloglevel as a decimal number plus newline into page and
+ * reports the part of it that starts at offset off, limited to count
+ * bytes.  *eof is set when the remainder fits into count.
+ * Returns the number of bytes available at *start (never negative).
+ */
+static int loglevel_read(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+        int avail;
+        avail = sprintf(page, "%d\n", jfsloglevel);
+        avail -= off;
+        *start = page + off;
+        if (avail <= count)
+                *eof = 1;
+        else
+                avail = count;
+        return avail < 0 ? 0 : avail;
+}
+/*
+ * write_proc handler for the "loglevel" entry.
+ *
+ * Only the first byte of the user buffer is examined; it must be a
+ * decimal digit, which becomes the new jfsloglevel.
+ * Returns count on success, -EFAULT when the byte cannot be fetched
+ * from user space and -EINVAL when it is not a digit.
+ */
+static int loglevel_write(struct file *file, const char __user *buffer,
+                        unsigned long count, void *data)
+{
+        char digit;
+        if (get_user(digit, buffer) != 0)
+                return -EFAULT;
+        /* yes, I know this is an ASCIIism.  --hch */
+        if (digit >= '0' && digit <= '9') {
+                jfsloglevel = digit - '0';
+                return count;
+        }
+        return -EINVAL;
+}
+#endif
+#ifdef CONFIG_JFS_STATISTICS
+extern read_proc_t jfs_lmstats_read;
+extern read_proc_t jfs_txstats_read;
+extern read_proc_t jfs_xtstat_read;
+extern read_proc_t jfs_mpstat_read;
+#endif
+/*
+ * Table of the files created below the "jfs" proc directory: entry name
+ * plus its read and (optional) write handler.  Which rows exist depends
+ * on CONFIG_JFS_STATISTICS and CONFIG_JFS_DEBUG.
+ */
+static struct {
+        const char      *name;
+        read_proc_t     *read_fn;
+        write_proc_t    *write_fn;
+} Entries[] = {
+#ifdef CONFIG_JFS_STATISTICS
+        { .name = "lmstats",  .read_fn = jfs_lmstats_read,  .write_fn = NULL },
+        { .name = "txstats",  .read_fn = jfs_txstats_read,  .write_fn = NULL },
+        { .name = "xtstat",   .read_fn = jfs_xtstat_read,   .write_fn = NULL },
+        { .name = "mpstat",   .read_fn = jfs_mpstat_read,   .write_fn = NULL },
+#endif
+#ifdef CONFIG_JFS_DEBUG
+        { .name = "TxAnchor", .read_fn = jfs_txanchor_read, .write_fn = NULL },
+        { .name = "loglevel", .read_fn = loglevel_read,     .write_fn = loglevel_write }
+#endif
+};
+/* number of rows in Entries[] */
+#define NPROCENT        (sizeof(Entries)/sizeof(Entries[0]))
+/*
+ * Create the "jfs" directory under proc_root_fs and one file in it for
+ * each row of Entries[].  Nothing is created if the directory cannot be
+ * made; a file that cannot be created is silently skipped.
+ */
+void jfs_proc_init(void)
+{
+        struct proc_dir_entry *entry;
+        int n;
+        base = proc_mkdir("jfs", proc_root_fs);
+        if (base == NULL)
+                return;
+        base->owner = THIS_MODULE;
+        for (n = 0; n < NPROCENT; n++) {
+                entry = create_proc_entry(Entries[n].name, 0, base);
+                if (entry == NULL)
+                        continue;
+                entry->read_proc = Entries[n].read_fn;
+                entry->write_proc = Entries[n].write_fn;
+        }
+}
+/*
+ * Remove every file listed in Entries[] and then the "jfs" directory
+ * itself.  Does nothing when the directory was never created.
+ */
+void jfs_proc_clean(void)
+{
+        int n;
+        if (base == NULL)
+                return;
+        for (n = 0; n < NPROCENT; n++)
+                remove_proc_entry(Entries[n].name, base);
+        remove_proc_entry("jfs", proc_root_fs);
+}
+#endif /* PROC_FS_JFS */
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
new file mode 100644
index 000000000000..a38079ae1e00
--- /dev/null
+++ b/fs/jfs/jfs_debug.h
@@ -0,0 +1,122 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DEBUG
+#define _H_JFS_DEBUG
+/*
+ *      jfs_debug.h
+ *
+ * global debug message, data structure/macro definitions
+ * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS;
+ */
+/*
+ * Create /proc/fs/jfs if procfs is enabled and either
+ * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined.
+ * PROC_FS_JFS is the single switch the proc code tests for.
+ */
+#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS))
+        #define PROC_FS_JFS
+#endif
+/*
+ *      assert with traditional printf/panic
+ *
+ * assert() is defined in every configuration (unlike ASSERT(), which
+ * only does something under CONFIG_JFS_DEBUG).  Without
+ * CONFIG_KERNEL_ASSERTS a failed assertion logs file, line and the
+ * expression text at KERN_CRIT and then calls BUG().
+ */
+#ifdef CONFIG_KERNEL_ASSERTS
+/* kgdb stuff */
+#define assert(p) KERNEL_ASSERT(#p, p)
+#else
+#define assert(p) do {  \
+        if (!(p)) {     \
+                printk(KERN_CRIT "BUG at %s:%d assert(%s)\n",   \
+                       __FILE__, __LINE__, #p);                 \
+                BUG();  \
+        }               \
+} while (0)
+#endif
+/*
+ *      debug ON
+ *      --------
+ *
+ * each jfs_* message macro prints only when jfsloglevel is at least the
+ * macro's JFS_LOGLEVEL_* value, so a larger jfsloglevel prints more.
+ * fmt is pasted between a KERN_* prefix and "\n" and therefore has to
+ * be a string literal.
+ */
+#ifdef CONFIG_JFS_DEBUG
+#define ASSERT(p) assert(p)
+/* printk verbosity */
+#define JFS_LOGLEVEL_ERR 1
+#define JFS_LOGLEVEL_WARN 2
+#define JFS_LOGLEVEL_DEBUG 3
+#define JFS_LOGLEVEL_INFO 4
+extern int jfsloglevel;
+/* dump memory contents */
+extern void dump_mem(char *label, void *data, int length);
+/* information message: e.g., configuration, major event */
+#define jfs_info(fmt, arg...) do {                      \
+        if (jfsloglevel >= JFS_LOGLEVEL_INFO)           \
+                printk(KERN_INFO fmt "\n", ## arg);     \
+} while (0)
+/* debug message: ad hoc */
+#define jfs_debug(fmt, arg...) do {                     \
+        if (jfsloglevel >= JFS_LOGLEVEL_DEBUG)          \
+                printk(KERN_DEBUG fmt "\n", ## arg);    \
+} while (0)
+/* warn message: */
+#define jfs_warn(fmt, arg...) do {                      \
+        if (jfsloglevel >= JFS_LOGLEVEL_WARN)           \
+                printk(KERN_WARNING fmt "\n", ## arg);  \
+} while (0)
+/* error event message: e.g., i/o error */
+#define jfs_err(fmt, arg...) do {                       \
+        if (jfsloglevel >= JFS_LOGLEVEL_ERR)            \
+                printk(KERN_ERR fmt "\n", ## arg);      \
+} while (0)
+/*
+ *      debug OFF
+ *      ---------
+ *
+ * everything above becomes an empty statement: the arguments are not
+ * evaluated and nothing is printed, jfs_err() included.
+ */
+#else                           /* CONFIG_JFS_DEBUG */
+#define dump_mem(label,data,length) do {} while (0)
+#define ASSERT(p) do {} while (0)
+#define jfs_info(fmt, arg...) do {} while (0)
+#define jfs_debug(fmt, arg...) do {} while (0)
+#define jfs_warn(fmt, arg...) do {} while (0)
+#define jfs_err(fmt, arg...) do {} while (0)
+#endif                          /* CONFIG_JFS_DEBUG */
+/*
+ *      statistics
+ *      ----------
+ *
+ * counter helpers that exist only under CONFIG_JFS_STATISTICS; without
+ * it they expand to nothing at all, so they can only be used as
+ * stand-alone statements and their arguments are not evaluated.
+ * HIGHWATERMARK(x,y) stores max(x, y) back into x and names x twice.
+ */
+#ifdef  CONFIG_JFS_STATISTICS
+#define INCREMENT(x)            ((x)++)
+#define DECREMENT(x)            ((x)--)
+#define HIGHWATERMARK(x,y)      ((x) = max((x), (y)))
+#else
+#define INCREMENT(x)
+#define DECREMENT(x)
+#define HIGHWATERMARK(x,y)
+#endif                          /* CONFIG_JFS_STATISTICS */
+#endif                          /* _H_JFS_DEBUG */
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
new file mode 100644
index 000000000000..580a3258449b
--- /dev/null
+++ b/fs/jfs/jfs_dinode.h
@@ -0,0 +1,151 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DINODE
+#define _H_JFS_DINODE
+/*
+ *      jfs_dinode.h: on-disk inode manager
+ */
+#define INODESLOTSIZE           128     /* 1 << L2INODESLOTSIZE */
+#define L2INODESLOTSIZE         7
+#define log2INODESIZE           9       /* log2(bytes per dinode): 1 << 9 = 512 */
+/*
+ *      on-disk inode : 512 bytes
+ *
+ * note: align 64-bit fields on 8-byte boundary.
+ *
+ * the number leading each field comment is the field's size in bytes.
+ * scalar fields are declared with little-endian types (__le32/__le64).
+ * the di_* macros interleaved with the union members give flat names
+ * to members nested inside union u.
+ */
+struct dinode {
+        /*
+         *      I. base area (128 bytes)
+         *      ------------------------
+         *
+         * define generic/POSIX attributes
+         */
+        __le32 di_inostamp;     /* 4: stamp to show inode belongs to fileset */
+        __le32 di_fileset;      /* 4: fileset number */
+        __le32 di_number;       /* 4: inode number, aka file serial number */
+        __le32 di_gen;          /* 4: inode generation number */
+        pxd_t di_ixpxd;         /* 8: inode extent descriptor */
+        __le64 di_size;         /* 8: size */
+        __le64 di_nblocks;      /* 8: number of blocks allocated */
+        __le32 di_nlink;        /* 4: number of links to the object */
+        __le32 di_uid;          /* 4: user id of owner */
+        __le32 di_gid;          /* 4: group id of owner */
+        __le32 di_mode;         /* 4: attribute, format and permission */
+        struct timestruc_t di_atime;    /* 8: time last data accessed */
+        struct timestruc_t di_ctime;    /* 8: time last status changed */
+        struct timestruc_t di_mtime;    /* 8: time last data modified */
+        struct timestruc_t di_otime;    /* 8: time created */
+        dxd_t di_acl;           /* 16: acl descriptor */
+        dxd_t di_ea;            /* 16: ea descriptor */
+        __le32 di_next_index;   /* 4: Next available dir_table index */
+        __le32 di_acltype;      /* 4: Type of ACL */
+        /*
+         *      Extension Areas.
+         *
+         *      Historically, the inode was partitioned into 4 128-byte areas,
+         *      the last 3 being defined as unions which could have multiple
+         *      uses.  The first 96 bytes had been completely unused until
+         *      an index table was added to the directory.  It is now more
+         *      useful to describe the last 3/4 of the inode as a single
+         *      union.  We would probably be better off redesigning the
+         *      entire structure from scratch, but we don't want to break
+         *      commonality with OS/2's JFS at this time.
+         */
+        union {
+                struct {
+                        /*
+                         * This table contains the information needed to
+                         * find a directory entry from a 32-bit index.
+                         * If the index is small enough, the table is inline,
+                         * otherwise, an x-tree root overlays this table
+                         */
+                        struct dir_table_slot _table[12]; /* 96: inline */
+                        dtroot_t _dtroot;               /* 288: dtree root */
+                } _dir;                                 /* (384) */
+#define di_dirtable     u._dir._table
+#define di_dtroot       u._dir._dtroot
+#define di_parent       di_dtroot.header.idotdot
+#define di_DASD         di_dtroot.header.DASD
+                struct {
+                        union {
+                                u8 _data[96];           /* 96: unused */
+                                struct {
+                                        /*
+                                         * NOTE(review): a pointer inside an
+                                         * on-disk layout is 8 bytes on 64-bit
+                                         * builds, which moves _gengen from
+                                         * offset 4 to offset 8 of _u1; the
+                                         * size of _u1 is still fixed by
+                                         * _data[96].  Confirm this is intended.
+                                         */
+                                        void *_imap;    /* 4: unused */
+                                        __le32 _gengen; /* 4: generator */
+                                } _imap;
+                        } _u1;                          /* 96: */
+#define di_gengen       u._file._u1._imap._gengen
+                        union {
+                                xtpage_t _xtroot;
+                                struct {
+                                        u8 unused[16];  /* 16: */
+                                        dxd_t _dxd;     /* 16: */
+                                        union {
+                                                __le32 _rdev;   /* 4: */
+                                                u8 _fastsymlink[128];
+                                        } _u;
+                                        u8 _inlineea[128];
+                                } _special;
+                        } _u2;
+                } _file;
+#define di_xtroot       u._file._u2._xtroot
+#define di_dxd          u._file._u2._special._dxd
+#define di_btroot       di_xtroot
+#define di_inlinedata   u._file._u2._special._u
+#define di_rdev         u._file._u2._special._u._rdev
+#define di_fastsymlink  u._file._u2._special._u._fastsymlink
+#define di_inlineea     u._file._u2._special._inlineea
+        } u;
+};
+/* extended mode bits (on-disk inode di_mode) */
+#define IFJOURNAL       0x00010000      /* journalled file */
+#define ISPARSE         0x00020000      /* sparse file enabled */
+#define INLINEEA        0x00040000      /* inline EA area free */
+#define ISWAPFILE       0x00800000      /* file open for pager swap space */
+/* more extended mode bits: attributes for OS/2 */
+#define IREADONLY       0x02000000      /* no write access to file */
+#define IARCHIVE        0x40000000      /* file archive bit */
+#define ISYSTEM         0x08000000      /* system file */
+#define IHIDDEN         0x04000000      /* hidden file */
+#define IRASH           0x4E000000      /* mask for changeable attributes:
+                                           IREADONLY | IARCHIVE | ISYSTEM | IHIDDEN */
+#define INEWNAME        0x80000000      /* non-8.3 filename format */
+#define IDIRECTORY      0x20000000      /* directory (shadow of real bit) */
+#define ATTRSHIFT       25      /* bits to shift to move attribute
+                                   specification to mode position
+                                   (1 << 25 == IREADONLY, the lowest
+                                   attribute bit above) */
+#endif /*_H_JFS_DINODE */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
new file mode 100644
index 000000000000..d86e467c6e42
--- /dev/null
+++ b/fs/jfs/jfs_dmap.c
@@ -0,0 +1,4272 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_lock.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+/*
+ *      Debug code for double-checking block map
+ *
+ * the #define of _JFS_DEBUG_DMAP just below is commented out, so by
+ * default the DB* macros expand to nothing and the DB* checking
+ * functions are not even declared.  Remove the comment markers around
+ * that #define to build the checking code in.
+ */
+/* #define      _JFS_DEBUG_DMAP 1 */
+#ifdef  _JFS_DEBUG_DMAP
+#define DBINITMAP(size,ipbmap,results) \
+        DBinitmap(size,ipbmap,results)
+#define DBALLOC(dbmap,mapsize,blkno,nblocks) \
+        DBAlloc(dbmap,mapsize,blkno,nblocks)
+#define DBFREE(dbmap,mapsize,blkno,nblocks) \
+        DBFree(dbmap,mapsize,blkno,nblocks)
+#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \
+        DBAllocCK(dbmap,mapsize,blkno,nblocks)
+#define DBFREECK(dbmap,mapsize,blkno,nblocks) \
+        DBFreeCK(dbmap,mapsize,blkno,nblocks)
+static void DBinitmap(s64, struct inode *, u32 **);
+static void DBAlloc(uint *, s64, s64, s64);
+static void DBFree(uint *, s64, s64, s64);
+static void DBAllocCK(uint *, s64, s64, s64);
+static void DBFreeCK(uint *, s64, s64, s64);
+#else
+#define DBINITMAP(size,ipbmap,results)
+#define DBALLOC(dbmap, mapsize, blkno, nblocks)
+#define DBFREE(dbmap, mapsize, blkno, nblocks)
+#define DBALLOCCK(dbmap, mapsize, blkno, nblocks)
+#define DBFREECK(dbmap, mapsize, blkno, nblocks)
+#endif                          /* _JFS_DEBUG_DMAP */
+/*
+ *      SERIALIZATION of the Block Allocation Map.
+ *
+ *      the working state of the block allocation map is accessed in
+ *      two directions:
+ *
+ *      1) allocation and free requests that start at the dmap
+ *         level and move up through the dmap control pages (i.e.
+ *         the vast majority of requests).
+ *
+ *      2) allocation requests that start at dmap control page
+ *         level and work down towards the dmaps.
+ *
+ *      the serialization scheme used here is as follows.
+ *
+ *      requests which start at the bottom are serialized against each
+ *      other through buffers, and each request holds onto its buffers
+ *      as it works its way up from a single dmap to the required level
+ *      of dmap control page.
+ *
+ *      requests that start at the top are serialized against each other
+ *      and against requests that start from the bottom by the multiple
+ *      read/single write inode lock of the bmap inode. requests starting
+ *      at the top take this lock in write mode while requests starting
+ *      at the bottom take the lock in read mode.  a single top-down
+ *      request may proceed exclusively while multiple bottom-up requests
+ *      may proceed simultaneously (under the protection of busy buffers).
+ *
+ *      in addition to information found in dmaps and dmap control pages,
+ *      the working state of the block allocation map also includes read/
+ *      write information maintained in the bmap descriptor (i.e. total
+ *      free block count, allocation group level free block counts).
+ *      a single exclusive lock (BMAP_LOCK) is used to guard this information
+ *      in the face of multiple bottom-up requests.
+ *      (lock ordering: IREAD_LOCK, BMAP_LOCK);
+ *
+ *      accesses to the persistent state of the block allocation map (limited
+ *      to the persistent bitmaps in dmaps) are guarded by (busy) buffers.
+ *
+ *      BMAP_LOCK is the db_bmaplock member of the bmap descriptor; it is
+ *      set up with init_MUTEX(), taken with down() and released with up().
+ */
+#define BMAP_LOCK_INIT(bmp)     init_MUTEX(&bmp->db_bmaplock)
+#define BMAP_LOCK(bmp)          down(&bmp->db_bmaplock)
+#define BMAP_UNLOCK(bmp)        up(&bmp->db_bmaplock)
+/*
+ * forward references
+ */
+static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                        int nblocks);
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
+static void dbBackSplit(dmtree_t * tp, int leafno);
+static void dbJoin(dmtree_t * tp, int leafno, int newval);
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
+                    int level);
+static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
+static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks);
+static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks,
+                       int l2nb, s64 * results);
+static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks);
+static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
+                          int l2nb,
+                          s64 * results);
+static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
+                     s64 * results);
+static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
+                      s64 * results);
+static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
+static int dbFindBits(u32 word, int l2nb);
+static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks);
+static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                      int nblocks);
+static int dbMaxBud(u8 * cp);
+s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
+static int blkstol2(s64 nb);
+static int cntlz(u32 value);
+static int cnttz(u32 word);
+static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                         int nblocks);
+static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
+static int dbInitDmapTree(struct dmap * dp);
+static int dbInitTree(struct dmaptree * dtp);
+static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
+static int dbGetL2AGSize(s64 nblocks);
+/*
+ *      buddy table
+ *
+ * table used for determining buddy sizes within characters of
+ * dmap bitmap words.  the characters themselves serve as indexes
+ * into the table, with the table elements yielding the maximum
+ * binary buddy of free bits within the character.
+ *
+ * entry 0x00 is 3 and entry 0xff is -1, the only negative entry.
+ * the table is a pure lookup table and is never written, hence const.
+ */
+static const s8 budtab[256] = {
+        3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+        2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
+};
+/*
+ * NAME:        dbMount()
+ *
+ * FUNCTION:    initialize the block allocation map.
+ *
+ *              memory is allocated for the in-core bmap descriptor and
+ *              the in-core descriptor is initialized from disk.
+ *
+ * PARAMETERS:
+ *      ipbmap  -  pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOMEM - insufficient memory
+ *      -EIO    - i/o error, or the on-disk descriptor carries an
+ *                allocation group count outside 0..MAXAG
+ */
+int dbMount(struct inode *ipbmap)
+{
+        struct bmap *bmp;
+        struct dbmap_disk *dbmp_le;
+        struct metapage *mp;
+        int i;
+        /*
+         * allocate/initialize the in-memory bmap descriptor
+         */
+        /* allocate memory for the in-memory bmap descriptor */
+        bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
+        if (bmp == NULL)
+                return -ENOMEM;
+        /* read the on-disk bmap descriptor. */
+        mp = read_metapage(ipbmap,
+                           BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+                           PSIZE, 0);
+        if (mp == NULL) {
+                kfree(bmp);
+                return -EIO;
+        }
+        /* copy the on-disk bmap descriptor to its in-memory version. */
+        dbmp_le = (struct dbmap_disk *) mp->data;
+        bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
+        bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+        bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+        bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+        /*
+         * db_numag comes straight from disk and later bounds loops over
+         * per-AG state (dbUnmount() walks db_active[0..db_numag-1]).
+         * The per-AG tables hold MAXAG entries (db_agfree[] is filled for
+         * MAXAG entries below; db_active[] is presumably sized the same),
+         * so refuse a count they cannot hold instead of indexing past
+         * them later.  Nothing has been published yet, so undoing the
+         * read and the allocation is all the cleanup needed.
+         */
+        if (bmp->db_numag < 0 || bmp->db_numag > MAXAG) {
+                release_metapage(mp);
+                kfree(bmp);
+                return -EIO;
+        }
+        bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
+        bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
+        bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
+        bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
+        bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
+        bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
+        bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
+        bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+        for (i = 0; i < MAXAG; i++)
+                bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
+        bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
+        bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
+        /* release the buffer. */
+        release_metapage(mp);
+        /* bind the bmap inode and the bmap descriptor to each other. */
+        bmp->db_ipbmap = ipbmap;
+        JFS_SBI(ipbmap->i_sb)->bmap = bmp;
+        memset(bmp->db_active, 0, sizeof(bmp->db_active));
+        DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
+        /*
+         * allocate/initialize the bmap lock
+         */
+        BMAP_LOCK_INIT(bmp);
+        return (0);
+}
+/*
+ * NAME:        dbUnmount()
+ *
+ * FUNCTION:    tear down the block allocation map at unmount time.
+ *
+ *              unless mounterror is set or the map inode is read-only,
+ *              the in-core descriptor is first written back with
+ *              dbSync(); the cached pages of the map inode are then
+ *              dropped and the in-core descriptor is freed.
+ *
+ * PARAMETERS:
+ *      ipbmap     -  pointer to in-core inode for the block map.
+ *      mounterror -  non-zero suppresses the write-back (going by the
+ *                    name, set when a mount attempt failed).
+ *
+ * RETURN VALUES:
+ *      0       - always; the result of dbSync() is not propagated
+ */
+int dbUnmount(struct inode *ipbmap, int mounterror)
+{
+        struct bmap *map = JFS_SBI(ipbmap->i_sb)->bmap;
+        int agno = 0;
+        /* write the descriptor back only for a clean, writable mount */
+        if (!mounterror && !isReadOnly(ipbmap))
+                dbSync(ipbmap);
+        /* invalidate the page cache buffers of the map inode */
+        truncate_inode_pages(ipbmap->i_mapping, 0);
+        /* sanity check: report any db_active[] counter that is still non-zero */
+        while (agno < map->db_numag) {
+                if (atomic_read(&map->db_active[agno]))
+                        printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
+                               agno, atomic_read(&map->db_active[agno]));
+                agno++;
+        }
+        /* the in-memory bmap descriptor is no longer needed */
+        kfree(map);
+        return 0;
+}
+/*
+ * NAME:        dbSync()
+ *
+ * FUNCTION:    write the block allocation map to disk.
+ *
+ *              the in-memory bmap descriptor is copied, in little-endian
+ *              form, into the bmap global control page; that page is
+ *              written, the dirty pages of the map inode are flushed and
+ *              waited on, and finally the map inode itself is written
+ *              with diWriteSpecial().
+ *
+ * PARAMETERS:
+ *      ipbmap  -  pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - the bmap control page could not be read
+ */
+int dbSync(struct inode *ipbmap)
+{
+        struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+        struct dbmap_disk *disk;
+        struct metapage *ctlmp;
+        int agno;
+        /* fetch the page that holds the on-disk bmap descriptor */
+        ctlmp = read_metapage(ipbmap,
+                              BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+                              PSIZE, 0);
+        if (!ctlmp) {
+                jfs_err("dbSync: read_metapage failed!");
+                return -EIO;
+        }
+        disk = (struct dbmap_disk *) ctlmp->data;
+        /* 64-bit counters and sizes */
+        disk->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
+        disk->dn_nfree = cpu_to_le64(bmp->db_nfree);
+        disk->dn_agsize = cpu_to_le64(bmp->db_agsize);
+        for (agno = 0; agno < MAXAG; agno++)
+                disk->dn_agfree[agno] = cpu_to_le64(bmp->db_agfree[agno]);
+        /* 32-bit geometry and allocation group parameters */
+        disk->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
+        disk->dn_numag = cpu_to_le32(bmp->db_numag);
+        disk->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
+        disk->dn_maxag = cpu_to_le32(bmp->db_maxag);
+        disk->dn_agpref = cpu_to_le32(bmp->db_agpref);
+        disk->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
+        disk->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
+        disk->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
+        disk->dn_agstart = cpu_to_le32(bmp->db_agstart);
+        disk->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
+        /* copied as is, without byte swapping */
+        disk->dn_maxfreebud = bmp->db_maxfreebud;
+        /* hand the updated control page back for writing */
+        write_metapage(ctlmp);
+        /* push out the remaining dirty pages of the map and wait for them */
+        filemap_fdatawrite(ipbmap->i_mapping);
+        filemap_fdatawait(ipbmap->i_mapping);
+        /* finally write the map inode itself */
+        ipbmap->i_state |= I_DIRTY;
+        diWriteSpecial(ipbmap, 0);
+        return (0);
+}
+/*
+ * NAME:        dbFree()
+ *
+ * FUNCTION:    free the specified block range from the working block
+ *              allocation map.
+ *
+ *              the blocks will be free from the working map one dmap
+ *              at a time.  a request for zero (or fewer) blocks that
+ *              passes the range check touches no dmap and succeeds.
+ *
+ * PARAMETERS:
+ *      ip      -  pointer to in-core inode;
+ *      blkno   -  starting block number to be freed.
+ *      nblocks -  number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error, or the range is outside the map
+ *      other   - error returned by dbFreeDmap()
+ *
+ * serialization: IREAD_LOCK(ipbmap) is taken on entry and dropped on
+ *                every return path.
+ */
+int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
+{
+        struct metapage *mp;
+        struct dmap *dp;
+        int nb, rc;
+        s64 lblkno, rem;
+        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+        struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+        IREAD_LOCK(ipbmap);
+        /* block to be freed better be within the mapsize. */
+        if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
+                IREAD_UNLOCK(ipbmap);
+                printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
+                       (unsigned long long) blkno,
+                       (unsigned long long) nblocks);
+                jfs_error(ip->i_sb,
+                          "dbFree: block to be freed is outside the map");
+                return -EIO;
+        }
+        /*
+         * free the blocks a dmap at a time.
+         */
+        mp = NULL;
+        for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+                /* release previous dmap if any */
+                if (mp) {
+                        write_metapage(mp);
+                }
+                /* get the buffer for the current dmap. */
+                lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+                mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL) {
+                        IREAD_UNLOCK(ipbmap);
+                        return -EIO;
+                }
+                dp = (struct dmap *) mp->data;
+                /* determine the number of blocks to be freed from
+                 * this dmap: either everything that remains, or the
+                 * blocks up to the end of this dmap, whichever is less.
+                 */
+                nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+                DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+                /* free the blocks. */
+                if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
+                        release_metapage(mp);
+                        IREAD_UNLOCK(ipbmap);
+                        return (rc);
+                }
+                DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+        }
+        /* write the last buffer.  mp is still NULL when the loop body
+         * never ran (nblocks <= 0), so there is nothing to write then.
+         */
+        if (mp)
+                write_metapage(mp);
+        IREAD_UNLOCK(ipbmap);
+        return (0);
+}
+/*
+ * NAME:        dbUpdatePMap()
+ *
+ * FUNCTION:    update the allocation state (free or allocate) of the
+ *              specified block range in the persistent block allocation map.
+ *
+ *              the blocks will be updated in the persistent map one
+ *              dmap at a time.  for every dmap page touched, the page's
+ *              lsn/clsn are reconciled with those of the transaction
+ *              block and the page is linked into the log's sync list
+ *              next to the transaction block.
+ *
+ * PARAMETERS:
+ *      ipbmap  -  pointer to in-core inode for the block map.
+ *      free    - TRUE if block range is to be freed from the persistent
+ *                map; FALSE if it is to be allocated.
+ *      blkno   -  starting block number of the range.
+ *      nblocks -  number of contiguous blocks in the range.
+ *      tblk    -  transaction block; supplies the lsn, clsn, log and
+ *                 sync list position used for the dmap pages.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error (a dmap page could not be read), or the
+ *                range extends beyond the map
+ */
+int
+dbUpdatePMap(struct inode *ipbmap,
+             int free, s64 blkno, s64 nblocks, struct tblock * tblk)
+{
+        int nblks, dbitno, wbitno, rbits;
+        int word, nbits, nwords;
+        struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+        s64 lblkno, rem, lastlblkno;
+        u32 mask;
+        struct dmap *dp;
+        struct metapage *mp;
+        struct jfs_log *log;
+        int lsn, difft, diffp;
+        /* the blocks better be within the mapsize. */
+        if (blkno + nblocks > bmp->db_mapsize) {
+                printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
+                       (unsigned long long) blkno,
+                       (unsigned long long) nblocks);
+                jfs_error(ipbmap->i_sb,
+                          "dbUpdatePMap: blocks are outside the map");
+                return -EIO;
+        }
+        /* compute delta of transaction lsn from log syncpt */
+        lsn = tblk->lsn;
+        log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
+        logdiff(difft, lsn, log);
+        /*
+         * update the block state a dmap at a time.
+         *
+         * lastlblkno remembers the dmap page handled by the previous
+         * iteration; it starts at 0, which presumably is never a valid
+         * dmap page number (TODO confirm against BLKTODMAP), so the
+         * first iteration always reads a page.
+         */
+        mp = NULL;
+        lastlblkno = 0;
+        for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
+                /* get the buffer for the current dmap.  the previous
+                 * buffer is written and a new one read only when the
+                 * dmap page differs from the one already held.
+                 */
+                lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+                if (lblkno != lastlblkno) {
+                        if (mp) {
+                                write_metapage(mp);
+                        }
+                        mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
+                                           0);
+                        if (mp == NULL)
+                                return -EIO;
+                }
+                dp = (struct dmap *) mp->data;
+                /* determine the bit number and word within the dmap of
+                 * the starting block.  also determine how many blocks
+                 * are to be updated within this dmap.
+                 */
+                dbitno = blkno & (BPERDMAP - 1);
+                word = dbitno >> L2DBWORD;
+                nblks = min(rem, (s64)BPERDMAP - dbitno);
+                /* update the bits of the dmap words. the first and last
+                 * words may only have a subset of their bits updated. if
+                 * this is the case, we'll work against that word (i.e.
+                 * partial first and/or last) only in a single pass.  a
+                 * single pass will also be used to update all words that
+                 * are to have all their bits updated.
+                 */
+                for (rbits = nblks; rbits > 0;
+                     rbits -= nbits, dbitno += nbits) {
+                        /* determine the bit number within the word and
+                         * the number of bits within the word.
+                         */
+                        wbitno = dbitno & (DBWORD - 1);
+                        nbits = min(rbits, DBWORD - wbitno);
+                        /* check if only part of the word is to be updated. */
+                        if (nbits < DBWORD) {
+                                /* update (free or allocate) the bits
+                                 * in this word: freeing clears the
+                                 * masked bits, allocating sets them.
+                                 */
+                                mask =
+                                    (ONES << (DBWORD - nbits) >> wbitno);
+                                if (free)
+                                        dp->pmap[word] &=
+                                            cpu_to_le32(~mask);
+                                else
+                                        dp->pmap[word] |=
+                                            cpu_to_le32(mask);
+                                word += 1;
+                        } else {
+                                /* one or more words are to have all
+                                 * their bits updated.  determine how
+                                 * many words and how many bits.
+                                 */
+                                nwords = rbits >> L2DBWORD;
+                                nbits = nwords << L2DBWORD;
+                                /* update (free or allocate) the bits
+                                 * in these words.
+                                 */
+                                if (free)
+                                        memset(&dp->pmap[word], 0,
+                                               nwords * 4);
+                                else
+                                        memset(&dp->pmap[word], (int) ONES,
+                                               nwords * 4);
+                                word += nwords;
+                        }
+                }
+                /*
+                 * update dmap lsn
+                 *
+                 * done once per dmap page: skipped when this page is
+                 * the one already handled by the previous iteration.
+                 */
+                if (lblkno == lastlblkno)
+                        continue;
+                lastlblkno = lblkno;
+                if (mp->lsn != 0) {
+                        /* inherit older/smaller lsn */
+                        logdiff(diffp, mp->lsn, log);
+                        if (difft < diffp) {
+                                mp->lsn = lsn;
+                                /* move bp after tblock in logsync list */
+                                LOGSYNC_LOCK(log);
+                                list_move(&mp->synclist, &tblk->synclist);
+                                LOGSYNC_UNLOCK(log);
+                        }
+                        /* inherit younger/larger clsn */
+                        LOGSYNC_LOCK(log);
+                        logdiff(difft, tblk->clsn, log);
+                        logdiff(diffp, mp->clsn, log);
+                        if (difft > diffp)
+                                mp->clsn = tblk->clsn;
+                        LOGSYNC_UNLOCK(log);
+                } else {
+                        mp->log = log;
+                        mp->lsn = lsn;
+                        /* insert bp after tblock in logsync list */
+                        LOGSYNC_LOCK(log);
+                        log->count++;
+                        list_add(&mp->synclist, &tblk->synclist);
+                        mp->clsn = tblk->clsn;
+                        LOGSYNC_UNLOCK(log);
+                }
+        }
+        /* write the last buffer. */
+        if (mp) {
+                write_metapage(mp);
+        }
+        return (0);
+}
+/*
+ * NAME:        dbNextAG()
+ *
+ * FUNCTION:    find the preferred allocation group for new allocations.
+ *
+ *              The preferred allocation group is one that has at least
+ *              the average amount of free space and no active allocator.
+ *              New inode allocation is targeted at the preferred group;
+ *              block allocation follows because the first (data) block
+ *              of an inode is allocated with the inode (block) as hint.
+ *
+ *              Having more than one open file growing in the same
+ *              allocation group leads to fragmentation, so groups with
+ *              an active allocator are passed over.  This differs from
+ *              the old OS/2 method of trying to keep empty ags around
+ *              for large allocations.
+ *
+ *              If no inactive group reaches the average, the inactive
+ *              group with the most free space is chosen; if there is
+ *              none with any free space, the preference is left alone.
+ *
+ * PARAMETERS:
+ *      ipbmap  -  pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *      the preferred allocation group number.
+ */
+int dbNextAG(struct inode *ipbmap)
+{
+        struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+        s64 avgfree;
+        s64 best_free = 0;
+        int pick = -1;
+        int ag;
+        int scanned;
+        BMAP_LOCK(bmp);
+        /* average number of free blocks per allocation group */
+        avgfree = (u32)bmp->db_nfree / bmp->db_numag;
+        ag = bmp->db_agpref;
+        /*
+         * only look for a replacement when the current preferred ag
+         * has an active allocator or less than average free space
+         */
+        if ((atomic_read(&bmp->db_active[ag]) != 0) ||
+            (bmp->db_agfree[ag] < avgfree)) {
+                /* walk every ag once, starting at the current preferred
+                 * one and wrapping around at the end
+                 */
+                for (scanned = 0; scanned < bmp->db_numag; scanned++, ag++) {
+                        if (ag == bmp->db_numag)
+                                ag = 0;
+                        /* skip ags in which an open file is growing */
+                        if (atomic_read(&bmp->db_active[ag]))
+                                continue;
+                        if (bmp->db_agfree[ag] >= avgfree) {
+                                /* good enough: take it and stop looking */
+                                pick = ag;
+                                break;
+                        }
+                        if (bmp->db_agfree[ag] > best_free) {
+                                /* below average, but the best so far */
+                                best_free = bmp->db_agfree[ag];
+                                pick = ag;
+                        }
+                }
+                /* with no candidate at all, db_agpref stays as it was */
+                if (pick != -1)
+                        bmp->db_agpref = pick;
+        }
+        BMAP_UNLOCK(bmp);
+        /* return the preferred group.
+         */
+        return (bmp->db_agpref);
+}
+/*
+ * NAME:        dbAlloc()
+ *
+ * FUNCTION:    attempt to allocate a specified number of contiguous free
+ *              blocks from the working allocation block map.
+ *
+ *              the block allocation policy uses hints and a multi-step
+ *              approach.
+ *
+ *              for allocation requests smaller than the number of blocks
+ *              per dmap, we first try to allocate the new blocks
+ *              immediately following the hint.  if these blocks are not
+ *              available, we try to allocate blocks near the hint.  if
+ *              no blocks near the hint are available, we next try to
+ *              allocate within the same dmap as contains the hint.
+ *
+ *              if no blocks are available in the dmap or the allocation
+ *              request is larger than the dmap size, we try to allocate
+ *              within the same allocation group as contains the hint. if
+ *              this does not succeed, we finally try to allocate anywhere
+ *              within the aggregate.
+ *
+ *              we also try to allocate anywhere within the aggregate
+ *              for allocation requests larger than the allocation group
+ *              size or requests that specify no hint value.
+ *
+ * PARAMETERS:
+ *      ip      -  pointer to in-core inode;
+ *      hint    - allocation hint; 0 means no hint.
+ *      nblocks - number of contiguous blocks in the range; must be
+ *                greater than zero (asserted).
+ *      results - on successful return, set to the starting block number
+ *                of the newly allocated contiguous range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error, or the hint is outside the map
+ *
+ * serialization: IREAD_LOCK(ipbmap) is held around the attempts made
+ *                within the hint's dmap, IWRITE_LOCK(ipbmap) around the
+ *                allocation group and map-wide attempts; neither lock
+ *                is held on return.
+ */
+int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
+{
+        int rc, agno;
+        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+        struct bmap *bmp;
+        struct metapage *mp;
+        s64 lblkno, blkno;
+        struct dmap *dp;
+        int l2nb;
+        s64 mapSize;
+        int writers;
+        /* assert that nblocks is valid */
+        assert(nblocks > 0);
+#ifdef _STILL_TO_PORT
+        /* DASD limit check                                     F226941 */
+        if (OVER_LIMIT(ip, nblocks))
+                return -ENOSPC;
+#endif                          /* _STILL_TO_PORT */
+        /* get the log2 number of blocks to be allocated.
+         * if the number of blocks is not a log2 multiple,
+         * it will be rounded up to the next log2 multiple.
+         */
+        l2nb = BLKSTOL2(nblocks);
+        bmp = JFS_SBI(ip->i_sb)->bmap;
+//retry:        /* serialize w.r.t.extendfs() */
+        mapSize = bmp->db_mapsize;
+        /* the hint should be within the map */
+        if (hint >= mapSize) {
+                jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
+                return -EIO;
+        }
+        /* if the number of blocks to be allocated is greater than the
+         * allocation group size, try to allocate anywhere.
+         */
+        if (l2nb > bmp->db_agl2size) {
+                IWRITE_LOCK(ipbmap);
+                rc = dbAllocAny(bmp, nblocks, l2nb, results);
+                if (rc == 0) {
+                        DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results,
+                                nblocks);
+                }
+                goto write_unlock;
+        }
+        /*
+         * If no hint, let dbNextAG recommend an allocation group
+         */
+        if (hint == 0)
+                goto pref_ag;
+        /* we would like to allocate close to the hint.  adjust the
+         * hint to the block following the hint since the allocators
+         * will start looking for free space starting at this point.
+         * if that block lies beyond the map, fall back to the
+         * preferred allocation group.
+         */
+        blkno = hint + 1;
+        if (blkno >= bmp->db_mapsize)
+                goto pref_ag;
+        agno = blkno >> bmp->db_agl2size;
+        /* check if blkno crosses over into a new allocation group.
+         * if so, check if we should allow allocations within this
+         * allocation group.
+         */
+        if ((blkno & (bmp->db_agsize - 1)) == 0)
+                /* check if the AG is currently being written to.
+                 * if so, call dbNextAG() to find a non-busy
+                 * AG with sufficient free space.
+                 */
+                if (atomic_read(&bmp->db_active[agno]))
+                        goto pref_ag;
+        /* check if the allocation request size can be satisfied from a
+         * single dmap.  if so, try to allocate from the dmap containing
+         * the hint using a tiered strategy.
+         */
+        if (nblocks <= BPERDMAP) {
+                IREAD_LOCK(ipbmap);
+                /* get the buffer for the dmap containing the hint.
+                 * rc is preset so that a failed read returns -EIO.
+                 */
+                rc = -EIO;
+                lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+                mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL)
+                        goto read_unlock;
+                dp = (struct dmap *) mp->data;
+                /* first, try to satisfy the allocation request with the
+                 * blocks beginning at the hint.  any result other than
+                 * -ENOSPC (success or hard error) ends the search.
+                 */
+                if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
+                    != -ENOSPC) {
+                        if (rc == 0) {
+                                *results = blkno;
+                                DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+                                        *results, nblocks);
+                                mark_metapage_dirty(mp);
+                        }
+                        release_metapage(mp);
+                        goto read_unlock;
+                }
+                writers = atomic_read(&bmp->db_active[agno]);
+                if ((writers > 1) ||
+                    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
+                        /*
+                         * Someone else is writing in this allocation
+                         * group.  To avoid fragmenting, try another ag
+                         */
+                        release_metapage(mp);
+                        IREAD_UNLOCK(ipbmap);
+                        goto pref_ag;
+                }
+                /* next, try to satisfy the allocation request with blocks
+                 * near the hint.
+                 */
+                if ((rc =
+                     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
+                    != -ENOSPC) {
+                        if (rc == 0) {
+                                DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+                                        *results, nblocks);
+                                mark_metapage_dirty(mp);
+                        }
+                        release_metapage(mp);
+                        goto read_unlock;
+                }
+                /* try to satisfy the allocation request with blocks within
+                 * the same dmap as the hint.
+                 */
+                if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
+                    != -ENOSPC) {
+                        if (rc == 0) {
+                                DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+                                        *results, nblocks);
+                                mark_metapage_dirty(mp);
+                        }
+                        release_metapage(mp);
+                        goto read_unlock;
+                }
+                release_metapage(mp);
+                IREAD_UNLOCK(ipbmap);
+        }
+        /* try to satisfy the allocation request with blocks within
+         * the same allocation group as the hint.
+         */
+        IWRITE_LOCK(ipbmap);
+        if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
+            != -ENOSPC) {
+                if (rc == 0)
+                        DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+                                *results, nblocks);
+                goto write_unlock;
+        }
+        IWRITE_UNLOCK(ipbmap);
+      pref_ag:
+        /*
+         * Let dbNextAG recommend a preferred allocation group.
+         * No bmap inode lock is held when this label is reached.
+         */
+        agno = dbNextAG(ipbmap);
+        IWRITE_LOCK(ipbmap);
+        /* Try to allocate within this allocation group.  if that fails, try to
+         * allocate anywhere in the map.
+         */
+        if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
+                rc = dbAllocAny(bmp, nblocks, l2nb, results);
+        if (rc == 0) {
+                DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks);
+        }
+      write_unlock:
+        IWRITE_UNLOCK(ipbmap);
+        return (rc);
+      read_unlock:
+        IREAD_UNLOCK(ipbmap);
+        return (rc);
+}
+#ifdef _NOTYET
+/*
+ * NAME:        dbAllocExact()
+ *
+ * FUNCTION:    try to allocate exactly the requested extent, i.e. the
+ *              nblocks blocks starting at blkno, from the working map.
+ *
+ * PARAMETERS:
+ *      ip      - pointer to in-core inode;
+ *      blkno   - extent address;
+ *      nblocks - extent length;
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EINVAL - extent length or address is not acceptable
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ */
+int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        struct inode *ipbmap = sbi->ipbmap;
+        struct bmap *bmp = sbi->bmap;
+        struct metapage *mp;
+        struct dmap *dp;
+        s64 dmap_lblk;
+        int rc;
+        IREAD_LOCK(ipbmap);
+        /*
+         * validate extent request:
+         *
+         * note: defragfs policy:
+         *  max 64 blocks will be moved.
+         *  allocation request size must be satisfied from a single dmap.
+         */
+        if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
+                rc = -EINVAL;
+                goto unlock;
+        }
+        /* larger than the largest free extent: it cannot be there any more */
+        if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
+                rc = -ENOSPC;
+                goto unlock;
+        }
+        /* bring in the dmap that covers the extent */
+        dmap_lblk = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+        mp = read_metapage(ipbmap, dmap_lblk, PSIZE, 0);
+        if (mp == NULL) {
+                rc = -EIO;
+                goto unlock;
+        }
+        dp = (struct dmap *) mp->data;
+        /* attempt the allocation, then drop the lock before touching
+         * the metapage state
+         */
+        rc = dbAllocNext(bmp, dp, blkno, nblocks);
+        IREAD_UNLOCK(ipbmap);
+        if (rc == 0) {
+                DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
+                mark_metapage_dirty(mp);
+        }
+        release_metapage(mp);
+        return (rc);
+      unlock:
+        IREAD_UNLOCK(ipbmap);
+        return (rc);
+}
+#endif /* _NOTYET */
+/*
+ * NAME:        dbReAlloc()
+ *
+ * FUNCTION:    attempt to extend a current allocation by a specified
+ *              number of blocks.
+ *
+ *              the extension is first attempted in place, by allocating
+ *              the blocks that immediately follow the current allocation
+ *              (see dbExtend()).  only if that fails for lack of space
+ *              is a brand new contiguous range requested, large enough
+ *              for the existing allocation plus the additional blocks.
+ *
+ * PARAMETERS:
+ *      ip          -  pointer to in-core inode requiring allocation.
+ *      blkno       -  starting block of the current allocation.
+ *      nblocks     -  number of contiguous blocks within the current
+ *                     allocation.
+ *      addnblocks  -  number of blocks to add to the allocation.
+ *      results -      on successful return, set to blkno if the existing
+ *                     allocation was extended in place, or to the start
+ *                     of a newly allocated contiguous range otherwise.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ */
+int
+dbReAlloc(struct inode *ip,
+          s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
+{
+        int rc = dbExtend(ip, blkno, nblocks, addnblocks);
+        switch (rc) {
+        case 0:
+                /* extended in place: the allocation still starts at blkno */
+                *results = blkno;
+                return (0);
+        case -ENOSPC:
+                /* no room behind the allocation; allocate afresh below */
+                break;
+        default:
+                /* any other failure is passed straight back */
+                return (rc);
+        }
+        /* ask for one range covering old and new blocks, using the last
+         * block of the existing allocation as the hint.
+         */
+        return (dbAlloc
+                (ip, blkno + nblocks - 1, addnblocks + nblocks, results));
+}
+/*
+ * NAME:        dbExtend()
+ *
+ * FUNCTION:    attempt to extend a current allocation in place by a
+ *              specified number of blocks, i.e. by allocating the blocks
+ *              that immediately follow the current allocation.
+ *
+ * PARAMETERS:
+ *      ip          -  pointer to in-core inode requiring allocation.
+ *      blkno       -  starting block of the current allocation.
+ *      nblocks     -  number of contiguous blocks within the current
+ *                     allocation.
+ *      addnblocks  -  number of blocks to add to the allocation.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ */
+static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        struct inode *ipbmap = sbi->ipbmap;
+        struct bmap *bmp;
+        struct metapage *mp;
+        struct dmap *dp;
+        s64 last, next, dmap_lblk;
+        uint page_off;
+        int rc;
+        /*
+         * An extent that does not start on a page boundary must not be
+         * grown across one.
+         */
+        page_off = blkno & (sbi->nbperpage - 1);
+        if (page_off &&
+            (page_off + nblocks + addnblocks > sbi->nbperpage))
+                return -ENOSPC;
+        /* last block of the current allocation and the block after it */
+        last = blkno + nblocks - 1;
+        next = last + 1;
+        IREAD_LOCK(ipbmap);
+        /* the current allocation must lie within the file system */
+        bmp = sbi->bmap;
+        if (last < 0 || last >= bmp->db_mapsize) {
+                IREAD_UNLOCK(ipbmap);
+                jfs_error(ip->i_sb,
+                          "dbExtend: the block is outside the filesystem");
+                return -EIO;
+        }
+        /* extending in place is only attempted when the additional
+         * blocks fit into a dmap, the current allocation does not end
+         * at the last block of the file system, and the extension would
+         * not start on an allocation group boundary.
+         */
+        if (addnblocks > BPERDMAP || next >= bmp->db_mapsize ||
+            (next & (bmp->db_agsize - 1)) == 0) {
+                rc = -ENOSPC;
+                goto unlock;
+        }
+        /* read the dmap that contains the first block of the extension */
+        dmap_lblk = BLKTODMAP(next, bmp->db_l2nbperpage);
+        mp = read_metapage(ipbmap, dmap_lblk, PSIZE, 0);
+        if (mp == NULL) {
+                rc = -EIO;
+                goto unlock;
+        }
+        DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
+        dp = (struct dmap *) mp->data;
+        /* try for the blocks right behind the current allocation */
+        rc = dbAllocNext(bmp, dp, next, (int) addnblocks);
+        IREAD_UNLOCK(ipbmap);
+        if (rc != 0) {
+                /* no luck: the dmap was not changed, just let go of it */
+                release_metapage(mp);
+                return (rc);
+        }
+        /* extended: record the allocation and write the dmap */
+        DBALLOC(bmp->db_DBmap, bmp->db_mapsize, next,
+                addnblocks);
+        write_metapage(mp);
+        return (rc);
+      unlock:
+        IREAD_UNLOCK(ipbmap);
+        return (rc);
+}
+/*
+ * NAME:        dbAllocNext()
+ *
+ * FUNCTION:    attempt to allocate the blocks of the specified block
+ *              range within a dmap.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap.
+ *      blkno   -  starting block number of the range.
+ *      nblocks -  number of contiguous free blocks of the range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks)
+{
+        int dbitno, word, rembits, nb, nwords, wbitno, nw;
+        int l2size;
+        s8 *leaf;
+        u32 mask;
+        if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
+                jfs_error(bmp->db_ipbmap->i_sb,
+                          "dbAllocNext: Corrupt dmap page");
+                return -EIO;
+        }
+        /* point at the leaves of the dmap tree.  leafidx was validated
+         * above, so the leaves begin at stree[LEAFIND]. */
+        leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
+        /* determine the bit number within the dmap of the starting
+         * block and the index of the dmap word that holds that bit.
+         */
+        dbitno = blkno & (BPERDMAP - 1);
+        word = dbitno >> L2DBWORD;
+        /* the whole block range must be contained within this dmap;
+         * a range that runs past its end cannot be allocated here.
+         */
+        if (dbitno + nblocks > BPERDMAP)
+                return -ENOSPC;
+        /* quick reject: the leaf for the starting word reports that
+         * nothing in that word is free.
+         */
+        if (leaf[word] == NOFREE)
+                return -ENOSPC;
+        /* check the dmap words corresponding to the block range to see
+         * if the block range is free.  not all bits of the first and
+         * last words may be contained within the block range.  if this
+         * is the case, we'll work against those words (i.e. partial first
+         * and/or last) on an individual basis (a single pass) and examine
+         * the actual bits to determine if they are free.  a single pass
+         * will be used for all dmap words fully contained within the
+         * specified range.  within this pass, the leaves of the dmap
+         * tree will be examined to determine if the blocks are free. a
+         * single leaf may describe the free space of multiple dmap
+         * words, so we may visit only a subset of the actual leaves
+         * corresponding to the dmap words of the block range.
+         */
+        for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+                /* determine the bit offset within the current word and
+                 * how many bits of the remaining range fall in this word.
+                 */
+                wbitno = dbitno & (DBWORD - 1);
+                nb = min(rembits, DBWORD - wbitno);
+                /* partial word: examine the individual bits of wmap.
+                 */
+                if (nb < DBWORD) {
+                        /* the bits are free only if every wmap bit under the
+                         * mask (nb bits starting at bit wbitno) is zero. */
+                        mask = (ONES << (DBWORD - nb) >> wbitno);
+                        if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
+                                return -ENOSPC;
+                        word += 1;
+                } else {
+                        /* one or more dmap words are fully contained
+                         * within the block range.  determine how many
+                         * whole words and how many bits they cover; nb is
+                         * reset so the outer loop advances past all of them.
+                         */
+                        nwords = rembits >> L2DBWORD;
+                        nb = nwords << L2DBWORD;
+                        /* now examine the appropriate leaves to determine
+                         * if the blocks are free.
+                         */
+                        while (nwords > 0) {
+                                /* a leaf value below BUDMIN means that the
+                                 * word is not entirely free. */
+                                if (leaf[word] < BUDMIN)
+                                        return -ENOSPC;
+                                /* determine the l2 number of bits provided
+                                 * by this leaf, capped by what is still
+                                 * needed (NLSTOL2BSZ(nwords)). */
+                                l2size =
+                                    min((int)leaf[word], NLSTOL2BSZ(nwords));
+                                /* determine how many words this leaf covered
+                                 * and skip over them. */
+                                nw = BUDSIZE(l2size, BUDMIN);
+                                nwords -= nw;
+                                word += nw;
+                        }
+                }
+        }
+        /* the whole range checked out as free: allocate the blocks and
+         * hand back dbAllocDmap()'s return code. */
+        return (dbAllocDmap(bmp, dp, blkno, nblocks));
+}
+/*
+ * NAME:        dbAllocNear()
+ *
+ * FUNCTION:    attempt to allocate a number of contiguous free blocks near
+ *              a specified block (hint) within a dmap.
+ *
+ *              starting with the dmap leaf that covers the hint, we'll
+ *              check the next four contiguous leaves for sufficient free
+ *              space.  if sufficient free space is found, we'll allocate
+ *              the desired free space.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap.
+ *      blkno   -  block number to allocate near.
+ *      nblocks -  actual number of contiguous free blocks desired.
+ *      l2nb    -  log2 number of contiguous free blocks desired.
+ *      results -  on successful return, set to the starting block number
+ *                 of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocNear(struct bmap * bmp,
+            struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
+{
+        int word, lword, rc;
+        s8 *leaf;
+        if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
+                jfs_error(bmp->db_ipbmap->i_sb,
+                          "dbAllocNear: Corrupt dmap page");
+                return -EIO;
+        }
+        leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
+        /* determine the word within the dmap that holds the hint
+         * (i.e. blkno).  also, determine the end of the examination
+         * window: at most four words starting at the hint's word,
+         * clipped to the number of leaves in a dmap (LPERDMAP). */
+        word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
+        lword = min(word + 4, LPERDMAP);
+        /* examine the leaves for sufficient free space.  the first leaf
+         * that qualifies decides the result; later leaves are not tried. */
+        for (; word < lword; word++) {
+                /* skip leaves that do not describe sufficient free space.
+                 */
+                if (leaf[word] < l2nb)
+                        continue;
+                /* determine the block number within the file system
+                 * of the first block described by this dmap word.
+                 */
+                blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
+                /* if not all bits of the dmap word are free, get the
+                 * starting bit number within the dmap word of the required
+                 * string of free bits (via dbFindBits()) and adjust the
+                 * block number by that value.
+                 */
+                if (leaf[word] < BUDMIN)
+                        blkno +=
+                            dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
+                /* allocate the blocks; *results is set only on success.
+                 */
+                if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
+                        *results = blkno;
+                return (rc);
+        }
+        return -ENOSPC;
+}
+/*
+ * NAME:        dbAllocAG()
+ *
+ * FUNCTION:    attempt to allocate the specified number of contiguous
+ *              free blocks within the specified allocation group.
+ *
+ *              unless the allocation group size is equal to the number
+ *              of blocks per dmap, the dmap control pages will be used to
+ *              find the required free space, if available.  we start the
+ *              search at the highest dmap control page level which
+ *              distinctly describes the allocation group's free space
+ *              (i.e. the highest level at which the allocation group's
+ *              free space is not mixed in with that of any other group).
+ *              in addition, we start the search within this level at a
+ *              height of the dmapctl dmtree at which the nodes distinctly
+ *              describe the allocation group's free space.  at this height,
+ *              the allocation group's free space may be represented by 1
+ *              or two sub-trees, depending on the allocation group size.
+ *              we search the top nodes of these subtrees left to right for
+ *              sufficient free space.  if sufficient free space is found,
+ *              the subtree is searched to find the leftmost leaf that 
+ *              has free space.  once we have made it to the leaf, we
+ *              move the search to the next lower level dmap control page
+ *              corresponding to this leaf.  we continue down the dmap control
+ *              pages until we find the dmap that contains or starts the
+ *              sufficient free space and we allocate at this dmap.
+ *
+ *              if the allocation group size is equal to the dmap size,
+ *              we'll start at the dmap corresponding to the allocation
+ *              group and attempt the allocation at this level.
+ *
+ *              the dmap control page search is also not performed if the
+ *              allocation group is completely free and we go to the first
+ *              dmap of the allocation group to do the allocation.  this is
+ *              done because the allocation group may be part (not the first
+ *              part) of a larger binary buddy system, causing the dmap
+ *              control pages to indicate no free space (NOFREE) within
+ *              the allocation group.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      agno    - allocation group number.
+ *      nblocks -  actual number of contiguous free blocks desired.
+ *      l2nb    -  log2 number of contiguous free blocks desired.
+ *      results -  on successful return, set to the starting block number
+ *                 of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * note: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
+{
+        struct metapage *mp;
+        struct dmapctl *dcp;
+        int rc, ti, i, k, m, n, agperlev;
+        s64 blkno, lblkno;
+        int budmin;
+        /* allocation request should not be for more than the
+         * allocation group size; a larger request is reported as an
+         * error on the file system and fails with -EIO. */
+        if (l2nb > bmp->db_agl2size) {
+                jfs_error(bmp->db_ipbmap->i_sb,
+                          "dbAllocAG: allocation request is larger than the "
+                          "allocation group size");
+                return -EIO;
+        }
+        /* determine the starting block number of the allocation
+         * group.
+         */
+        blkno = (s64) agno << bmp->db_agl2size;
+        /* check if the allocation group size is the minimum allocation
+         * group size or if the allocation group is completely free. if
+         * the allocation group size is the minimum size of BPERDMAP (i.e.
+         * 1 dmap), there is no need to search the dmap control page (below)
+         * that fully describes the allocation group since the allocation
+         * group is already fully described by a dmap.  in this case, we
+         * just call dbAllocCtl() to search the dmap tree and allocate the
+         * required space if available.
+         *
+         * if the allocation group is completely free, dbAllocCtl() is
+         * also called to allocate the required space.  this is done for
+         * two reasons.  first, it makes no sense searching the dmap control
+         * pages for free space when we know that free space exists.  second,
+         * the dmap control pages may indicate that the allocation group
+         * has no free space if the allocation group is part (not the first
+         * part) of a larger binary buddy system.  a -ENOSPC result for a
+         * completely free group is logged, but rc is returned unchanged. */
+        if (bmp->db_agsize == BPERDMAP
+            || bmp->db_agfree[agno] == bmp->db_agsize) {
+                rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+                if ((rc == -ENOSPC) &&
+                    (bmp->db_agfree[agno] == bmp->db_agsize)) {
+                        printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
+                               (unsigned long long) blkno,
+                               (unsigned long long) nblocks);
+                        jfs_error(bmp->db_ipbmap->i_sb,
+                                  "dbAllocAG: dbAllocCtl failed in free AG");
+                }
+                return (rc);
+        }
+        /* get the buffer for the dmap control page that fully describes
+         * the allocation group.  budmin is copied into a local because it
+         * is still needed after the buffer has been released below. */
+        lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
+        mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+        if (mp == NULL)
+                return -EIO;
+        dcp = (struct dmapctl *) mp->data;
+        budmin = dcp->budmin;
+        if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
+                jfs_error(bmp->db_ipbmap->i_sb,
+                          "dbAllocAG: Corrupt dmapctl page");
+                release_metapage(mp);
+                return -EIO;
+        }
+        /* search the subtree(s) of the dmap control page that describes
+         * the allocation group, looking for sufficient free space.  to begin,
+         * determine how many allocation groups are represented in a dmap
+         * control page at the control page level (i.e. L0, L1, L2) that
+         * fully describes an allocation group. next, determine the starting
+         * tree index of this allocation group within the control page.
+         */
+        agperlev =
+            (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
+        ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
+        /* dmap control page trees fan-out by 4 and a single allocation
+         * group may be described by 1 or 2 subtrees within the ag level
+         * dmap control page, depending upon the ag size. examine the ag's
+         * subtrees for sufficient free space, starting with the leftmost
+         * subtree.  every path through a qualifying subtree returns.
+         */
+        for (i = 0; i < bmp->db_agwidth; i++, ti++) {
+                /* skip a subtree whose top node lacks sufficient free space.
+                 */
+                if (l2nb > dcp->stree[ti])
+                        continue;
+                /* sufficient free space found in a subtree. now search down
+                 * the subtree to find the leftmost leaf that describes this
+                 * free space.  the children of node ti are at (ti << 2) + 1
+                 * through (ti << 2) + 4; no qualifying child is an error. */
+                for (k = bmp->db_agheigth; k > 0; k--) {
+                        for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
+                                if (l2nb <= dcp->stree[m + n]) {
+                                        ti = m + n;
+                                        break;
+                                }
+                        }
+                        if (n == 4) {
+                                jfs_error(bmp->db_ipbmap->i_sb,
+                                          "dbAllocAG: failed descending stree");
+                                release_metapage(mp);
+                                return -EIO;
+                        }
+                }
+                /* determine the block number within the file system
+                 * that corresponds to this leaf: the first block covered
+                 * by this control page plus the leaf's offset << budmin. */
+                if (bmp->db_aglevel == 2)
+                        blkno = 0;
+                else if (bmp->db_aglevel == 1)
+                        blkno &= ~(MAXL1SIZE - 1);
+                else            /* bmp->db_aglevel == 0 */
+                        blkno &= ~(MAXL0SIZE - 1);
+                blkno +=
+                    ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
+                /* release the buffer in preparation for going down
+                 * the next level of dmap control pages.  dcp must not be
+                 * dereferenced past this point. */
+                release_metapage(mp);
+                /* check if we need to continue to search down the lower
+                 * level dmap control pages.  we need to if the number of
+                 * blocks required is less than maximum number of blocks
+                 * described at the next lower level.
+                 */
+                if (l2nb < budmin) {
+                        /* search the lower level dmap control pages to get
+                         * the starting block number of the dmap that
+                         * contains or starts off the free space.  -ENOSPC here
+                         * contradicts the page just searched, so it is -EIO. */
+                        if ((rc =
+                             dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
+                                       &blkno))) {
+                                if (rc == -ENOSPC) {
+                                        jfs_error(bmp->db_ipbmap->i_sb,
+                                                  "dbAllocAG: control page "
+                                                  "inconsistent");
+                                        return -EIO;
+                                }
+                                return (rc);
+                        }
+                }
+                /* allocate the blocks.  -ENOSPC at this point is reported
+                 * as an error and converted to -EIO. */
+                rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+                if (rc == -ENOSPC) {
+                        jfs_error(bmp->db_ipbmap->i_sb,
+                                  "dbAllocAG: unable to allocate blocks");
+                        rc = -EIO;
+                }
+                return (rc);
+        }
+        /* no space in the allocation group.  release the buffer and
+         * return -ENOSPC.
+         */
+        release_metapage(mp);
+        return -ENOSPC;
+}
+/*
+ * NAME:        dbAllocAny()
+ *
+ * FUNCTION:    attempt to allocate the specified number of contiguous
+ *              free blocks anywhere in the file system.
+ *
+ *              dbAllocAny() attempts to find the sufficient free space by
+ *              searching down the dmap control pages, starting with the
+ *              highest level (i.e. L0, L1, L2) control page.  if free space
+ *              large enough to satisfy the desired free space is found, the
+ *              desired free space is allocated.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      nblocks  -  actual number of contiguous free blocks desired.
+ *      l2nb     -  log2 number of contiguous free blocks desired.
+ *      results -  on successful return, set to the starting block number
+ *                 of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
+{
+        int rc;
+        s64 blkno = 0;
+        /* starting with the top level dmap control page (db_maxlevel)
+         * and block number 0, search down the dmap control levels for
+         * sufficient free space.  if free space is found, dbFindCtl()
+         * stores in blkno the starting block number of the dmap that
+         * contains or starts off the range of free space. */
+        if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
+                return (rc);
+        /* allocate the blocks.  -ENOSPC here is reported and becomes -EIO.
+         */
+        rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
+        if (rc == -ENOSPC) {
+                jfs_error(bmp->db_ipbmap->i_sb,
+                          "dbAllocAny: unable to allocate blocks");
+                return -EIO;
+        }
+        return (rc);
+}
+/*
+ * NAME:        dbFindCtl()
+ *
+ * FUNCTION:    starting at a specified dmap control page level and block
+ *              number, search down the dmap control levels for a range of
+ *              contiguous free blocks large enough to satisfy an allocation
+ *              request for the specified number of free blocks.
+ *
+ *              if sufficient contiguous free blocks are found, this routine
+ *              returns the starting block number within a dmap page that
+ *              contains or starts a range of contiguous free blocks that
+ *              is sufficient in size.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      l2nb    -  log2 number of contiguous free blocks desired.
+ *      level   -  starting dmap control page level.
+ *      *blkno  -  on entry, starting block number for conducting the search.
+ *                 on successful return, the first block within a dmap page
+ *                 that contains or starts a range of contiguous free blocks.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
+{
+        int rc, leafidx, lev;
+        s64 b, lblkno;
+        struct dmapctl *dcp;
+        int budmin;
+        struct metapage *mp;
+        /* starting at the specified dmap control page level and block
+         * number, search down the dmap control levels for the starting
+         * block number of a dmap page that contains or starts off
+         * sufficient free blocks.  b accumulates the block number.
+         */
+        for (lev = level, b = *blkno; lev >= 0; lev--) {
+                /* get the buffer of the dmap control page for the block
+                 * number and level (i.e. L0, L1, L2).
+                 */
+                lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
+                mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL)
+                        return -EIO;
+                dcp = (struct dmapctl *) mp->data;
+                budmin = dcp->budmin;
+                if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
+                        jfs_error(bmp->db_ipbmap->i_sb,
+                                  "dbFindCtl: Corrupt dmapctl page");
+                        release_metapage(mp);
+                        return -EIO;
+                }
+                /* search the tree within the dmap control page for
+                 * sufficient free space.  if sufficient free space is found,
+                 * dbFindLeaf() returns the index of the leaf at which
+                 * free space was found.
+                 */
+                rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
+                /* release the buffer; budmin was copied out above.
+                 */
+                release_metapage(mp);
+                /* no space found: -ENOSPC at the starting level, -EIO below.
+                 */
+                if (rc) {
+                        if (lev != level) {
+                                jfs_error(bmp->db_ipbmap->i_sb,
+                                          "dbFindCtl: dmap inconsistent");
+                                return -EIO;
+                        }
+                        return -ENOSPC;
+                }
+                /* adjust the block number to reflect the location within
+                 * the dmap control page (i.e. the leaf) at which free
+                 * space was found.
+                 */
+                b += (((s64) leafidx) << budmin);
+                /* we stop the search at this dmap control page level if
+                 * the number of blocks required is greater than or equal
+                 * to the maximum number of blocks described at the next
+                 * (lower) level.
+                 */
+                if (l2nb >= budmin)
+                        break;
+        }
+        *blkno = b;
+        return (0);
+}
+/*
+ * NAME:        dbAllocCtl()
+ *
+ * FUNCTION:    attempt to allocate a specified number of contiguous
+ *              blocks starting within a specific dmap.  
+ *              
+ *              this routine is called by higher level routines that search
+ *              the dmap control pages above the actual dmaps for contiguous
+ *              free space.  the result of successful searches by these
+ *              routines are the starting block numbers within dmaps, with
+ *              the dmaps themselves containing the desired contiguous free
+ *              space or starting a contiguous free space of desired size
+ *              that is made up of the blocks of one or more dmaps. these
+ *              calls should not fail due to insufficient resources.
+ *
+ *              this routine is called in some cases where it is not known
+ *              whether it will fail due to insufficient resources.  more
+ *              specifically, this occurs when allocating from an allocation
+ *              group whose size is equal to the number of blocks per dmap.
+ *              in this case, the dmap control pages are not examined prior
+ *              to calling this routine (to save pathlength) and the call
+ *              might fail.
+ *
+ *              for a request size that fits within a dmap, this routine relies
+ *              upon the dmap's dmtree to find the requested contiguous free
+ *              space.  for request sizes that are larger than a dmap, the
+ *              requested free space will start at the first block of the
+ *              first dmap (i.e. blkno).
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      nblocks  -  actual number of contiguous free blocks to allocate.
+ *      l2nb     -  log2 number of contiguous free blocks to allocate.
+ *      blkno    -  starting block number of the dmap to start the allocation
+ *                  from.
+ *      results -  on successful return, set to the starting block number
+ *                 of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
+{
+        int rc, nb;
+        s64 b, lblkno, n;
+        struct metapage *mp;
+        struct dmap *dp;
+        /* check if the allocation request is confined to a single dmap.
+         */
+        if (l2nb <= L2BPERDMAP) {
+                /* get the buffer for the dmap that contains blkno.
+                 */
+                lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+                mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL)
+                        return -EIO;
+                dp = (struct dmap *) mp->data;
+                /* try to allocate the blocks; the page is marked dirty
+                 * only on success and is released either way. */
+                rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
+                if (rc == 0)
+                        mark_metapage_dirty(mp);
+                release_metapage(mp);
+                return (rc);
+        }
+        /* allocation request involving multiple dmaps. it must start on
+         * a dmap boundary.
+         */
+        assert((blkno & (BPERDMAP - 1)) == 0);
+        /* allocate the blocks dmap by dmap; n counts blocks still needed.
+         */
+        for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
+                /* get the buffer for the dmap.
+                 */
+                lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
+                mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL) {
+                        rc = -EIO;
+                        goto backout;
+                }
+                dp = (struct dmap *) mp->data;
+                /* the dmap better be all free (root equals L2BPERDMAP).
+                 */
+                if (dp->tree.stree[ROOT] != L2BPERDMAP) {
+                        release_metapage(mp);
+                        jfs_error(bmp->db_ipbmap->i_sb,
+                                  "dbAllocCtl: the dmap is not all free");
+                        rc = -EIO;
+                        goto backout;
+                }
+                /* determine how many blocks to allocate from this dmap.
+                 */
+                nb = min(n, (s64)BPERDMAP);
+                /* allocate the blocks from the dmap.
+                 */
+                if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
+                        release_metapage(mp);
+                        goto backout;
+                }
+                /* write the buffer.
+                 */
+                write_metapage(mp);
+        }
+        /* set the results (starting block number) and return.
+         */
+        *results = blkno;
+        return (0);
+        /* something failed in handling an allocation request involving
+         * multiple dmaps.  we'll try to clean up by backing out any
+         * allocation that has already happened for this request.  if
+         * we fail in backing out the allocation, we'll mark the file
+         * system to indicate that blocks have been leaked.
+         */
+      backout:
+        /* back out dmap by dmap.  nblocks - n blocks were allocated, and
+         * each completed dmap took BPERDMAP since more was still needed. */
+        for (n = nblocks - n, b = blkno; n > 0;
+             n -= BPERDMAP, b += BPERDMAP) {
+                /* get the buffer for this dmap.
+                 */
+                lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
+                mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL) {
+                        /* could not back out.  mark the file system
+                         * to indicate that we have leaked blocks, then
+                         * carry on with the remaining dmaps. */
+                        jfs_error(bmp->db_ipbmap->i_sb,
+                                  "dbAllocCtl: I/O Error: Block Leakage.");
+                        continue;
+                }
+                dp = (struct dmap *) mp->data;
+                /* free the blocks in this dmap.
+                 */
+                if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
+                        /* could not back out.  mark the file system
+                         * to indicate that we have leaked blocks, then
+                         * carry on with the remaining dmaps. */
+                        release_metapage(mp);
+                        jfs_error(bmp->db_ipbmap->i_sb,
+                                  "dbAllocCtl: Block Leakage.");
+                        continue;
+                }
+                /* write the buffer.
+                 */
+                write_metapage(mp);
+        }
+        return (rc);
+}
+/*
+ * NAME:        dbAllocDmapLev()
+ *
+ * FUNCTION:    attempt to allocate a specified number of contiguous blocks
+ *              from a specified dmap.
+ *              
+ *              this routine checks if the contiguous blocks are available.
+ *              if so, nblocks blocks are allocated; otherwise, -ENOSPC is
+ *              returned.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap to attempt to allocate blocks from.
+ *      nblocks -  actual number of contiguous blocks desired.
+ *      l2nb    -  log2 number of contiguous blocks desired.
+ *      results -  on successful return, set to the starting block number
+ *                 of the newly allocated range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient disk resources
+ *      -EIO    - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 
+ *      IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
+ */
+static int
+dbAllocDmapLev(struct bmap * bmp,
+               struct dmap * dp, int nblocks, int l2nb, s64 * results)
+{
+        s64 blkno;
+        int leafidx, rc;
+        /* can't be more than a dmaps worth of blocks */
+        assert(l2nb <= L2BPERDMAP);
+        /* search the tree within the dmap page for sufficient
+         * free space.  if sufficient free space is found, dbFindLeaf()
+         * returns the index of the leaf at which free space was found.
+         */
+        if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
+                return -ENOSPC;
+        /* determine the block number within the file system corresponding
+         * to the leaf at which free space was found.
+         */
+        blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
+        /* if not all bits of the dmap word are free, get the starting
+         * bit number within the dmap word of the required string of free
+         * bits and adjust the block number with this value.
+         */
+        if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
+                blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
+        /* allocate the blocks */
+        if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
+                *results = blkno;
+        return (rc);
+}
+/*
+ * NAME:        dbAllocDmap()
+ *
+ * FUNCTION:    record the allocation of a block range within a dmap in
+ *              the disk allocation map.
+ *
+ *              the bits are allocated with dbAllocBits().  if that
+ *              changes the root of the dmap's dmtree (i.e. the maximum
+ *              string of free blocks within the dmap), the new root is
+ *              pushed up through the dmap control pages by calling
+ *              dbAdjCtl() for the L0 dmap control page covering this
+ *              dmap.  should that fail, the bit allocation is undone
+ *              with dbFreeBits().
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap to allocate the block range from.
+ *      blkno   -  starting block number of the block to be allocated.
+ *      nblocks -  number of blocks to be allocated.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks)
+{
+        /* remember the dmap tree root (maximum free string) as it is
+         * before the allocation.
+         */
+        s8 prevroot = dp->tree.stree[ROOT];
+        int rc;
+        /* mark the blocks allocated in the dmap */
+        dbAllocBits(bmp, dp, blkno, nblocks);
+        /* nothing to propagate when the root is unchanged */
+        if (dp->tree.stree[ROOT] == prevroot)
+                return 0;
+        /* the root moved: reflect it in the dmap control pages,
+         * starting at level 0.  on failure, release the bits again so
+         * that the dmap and the control pages stay consistent.
+         */
+        rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0);
+        if (rc)
+                dbFreeBits(bmp, dp, blkno, nblocks);
+        return rc;
+}
+/*
+ * NAME:        dbFreeDmap()
+ *
+ * FUNCTION:    adjust the disk allocation map to reflect the deallocation
+ *              of a specified block range within a dmap.
+ *
+ *              this routine frees the specified blocks from the dmap through
+ *              a call to dbFreeBits(). if the deallocation of the block range
+ *              causes the maximum string of free blocks within the dmap to
+ *              change (i.e. the value of the root of the dmap's dmtree), this
+ *              routine will cause this change to be reflected up through the
+ *              appropriate levels of the dmap control pages by a call to
+ *              dbAdjCtl() for the L0 dmap control page that covers this dmap.
+ *              if dbAdjCtl() fails, the deallocation is backed out by
+ *              re-allocating the bits with dbAllocBits().
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap to free the block range from.
+ *      blkno   -  starting block number of the block to be freed.
+ *      nblocks -  number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                      int nblocks)
+{
+        s8 oldroot;
+        int rc, word;
+        /* save the current value of the root (i.e. maximum free string)
+         * of the dmap tree.
+         */
+        oldroot = dp->tree.stree[ROOT];
+        /* free the specified (blocks) bits */
+        dbFreeBits(bmp, dp, blkno, nblocks);
+        /* if the root has not changed, done. */
+        if (dp->tree.stree[ROOT] == oldroot)
+                return (0);
+        /* root changed. bubble the change up to the dmap control pages.
+         * if the adjustment of the upper level control pages fails,
+         * backout the deallocation.
+         */
+        if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
+                /* leaf number (dmap word) of the first freed block */
+                word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
+                /* as part of backing out the deallocation, we will have
+                 * to back split the dmap tree if the deallocation caused
+                 * the freed blocks to become part of a larger binary buddy
+                 * system.
+                 *
+                 * 'word' is a leaf number, and the leaves of the dmap
+                 * tree start at stree[LEAFIND], so the leaf must be
+                 * read at LEAFIND + word.  this is the same leaf that
+                 * dbBackSplit() examines for the given leaf number.
+                 */
+                if (dp->tree.stree[LEAFIND + word] == NOFREE)
+                        dbBackSplit((dmtree_t *) & dp->tree, word);
+                dbAllocBits(bmp, dp, blkno, nblocks);
+        }
+        return (rc);
+}
+/*
+ * NAME:        dbAllocBits()
+ *
+ * FUNCTION:    allocate a specified block range from a dmap.
+ *
+ *              this routine updates the dmap to reflect the working
+ *              state allocation of the specified block range. it directly
+ *              updates the bits of the working map and causes the adjustment
+ *              of the binary buddy system described by the dmap's dmtree
+ *              leaves to reflect the bits allocated.  it also causes the
+ *              dmap's dmtree, as a whole, to reflect the allocated range.
+ *              finally, the free counts of the dmap, the allocation group
+ *              and the map are decremented, and the maximum allocation
+ *              group number is raised if needed (under BMAP_LOCK).
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap to allocate bits from.
+ *      blkno   -  starting block number of the bits to be allocated.
+ *      nblocks -  number of bits to be allocated.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                        int nblocks)
+{
+        int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
+        dmtree_t *tp = (dmtree_t *) & dp->tree;
+        int size;
+        s8 *leaf;
+        /* pick up a pointer to the leaves of the dmap tree */
+        leaf = dp->tree.stree + LEAFIND;
+        /* determine the bit number and word within the dmap of the
+         * starting block.
+         */
+        dbitno = blkno & (BPERDMAP - 1);
+        word = dbitno >> L2DBWORD;
+        /* block range better be within the dmap */
+        assert(dbitno + nblocks <= BPERDMAP);
+        /* allocate the bits of the dmap's words corresponding to the block
+         * range. not all bits of the first and last words may be contained
+         * within the block range.  if this is the case, we'll work against
+         * those words (i.e. partial first and/or last) on an individual basis
+         * (a single pass), allocating the bits of interest by hand and
+         * updating the leaf corresponding to the dmap word. a single pass
+         * will be used for all dmap words fully contained within the
+         * specified range.  within this pass, the bits of all fully contained
+         * dmap words will be marked as allocated in a single shot and the
+         * leaves will be updated. a single leaf may describe the free space
+         * of multiple dmap words, so we may update only a subset of the
+         * actual leaves corresponding to the dmap words of the block range.
+         */
+        for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+                /* determine the bit number within the word and
+                 * the number of bits within the word.
+                 */
+                wbitno = dbitno & (DBWORD - 1);
+                nb = min(rembits, DBWORD - wbitno);
+                /* check if only part of a word is to be allocated.
+                 */
+                if (nb < DBWORD) {
+                        /* allocate (set to 1) the appropriate bits within
+                         * this dmap word: build a mask of nb one bits that
+                         * starts wbitno bits down from the high-order end.
+                         */
+                        dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
+                                                      >> wbitno);
+                        /* update the leaf for this dmap word. in addition
+                         * to setting the leaf value to the binary buddy max
+                         * of the updated dmap word, dbSplit() will split
+                         * the binary system of the leaves if need be.
+                         */
+                        dbSplit(tp, word, BUDMIN,
+                                dbMaxBud((u8 *) & dp->wmap[word]));
+                        word += 1;
+                } else {
+                        /* one or more dmap words are fully contained
+                         * within the block range.  determine how many
+                         * words and allocate (set to 1) the bits of these
+                         * words.  the byte count is nwords * 4; this
+                         * presumably relies on wmap words being 32 bits
+                         * wide, as the cpu_to_le32() use above suggests.
+                         */
+                        nwords = rembits >> L2DBWORD;
+                        memset(&dp->wmap[word], (int) ONES, nwords * 4);
+                        /* determine how many bits.  this is set before
+                         * the leaf loop so that the outer loop advances
+                         * correctly even if the leaf loop bails out early.
+                         */
+                        nb = nwords << L2DBWORD;
+                        /* now update the appropriate leaves to reflect
+                         * the allocated words.
+                         */
+                        for (; nwords > 0; nwords -= nw) {
+                                /* a fully contained word is being
+                                 * allocated here, so a leaf value below
+                                 * BUDMIN is reported as corruption and
+                                 * the leaf updates are abandoned.
+                                 */
+                                if (leaf[word] < BUDMIN) {
+                                        jfs_error(bmp->db_ipbmap->i_sb,
+                                                  "dbAllocBits: leaf page "
+                                                  "corrupt");
+                                        break;
+                                }
+                                /* determine what the leaf value should be
+                                 * updated to as the minimum of the l2 number
+                                 * of bits being allocated and the l2 number
+                                 * of bits currently described by this leaf.
+                                 */
+                                size = min((int)leaf[word], NLSTOL2BSZ(nwords));
+                                /* update the leaf to reflect the allocation.
+                                 * in addition to setting the leaf value to
+                                 * NOFREE, dbSplit() will split the binary
+                                 * system of the leaves to reflect the current
+                                 * allocation (size).
+                                 */
+                                dbSplit(tp, word, size, NOFREE);
+                                /* get the number of dmap words handled */
+                                nw = BUDSIZE(size, BUDMIN);
+                                word += nw;
+                        }
+                }
+        }
+        /* update the free count for this dmap */
+        dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+        BMAP_LOCK(bmp);
+        /* if the allocation group holding this range lies to the right
+         * of the current maximum allocation group, it becomes the new
+         * maximum.
+         */
+        agno = blkno >> bmp->db_agl2size;
+        if (agno > bmp->db_maxag)
+                bmp->db_maxag = agno;
+        /* update the free count for the allocation group and map */
+        bmp->db_agfree[agno] -= nblocks;
+        bmp->db_nfree -= nblocks;
+        BMAP_UNLOCK(bmp);
+}
+/*
+ * NAME:        dbFreeBits()
+ *
+ * FUNCTION:    free a specified block range from a dmap.
+ *
+ *              this routine updates the dmap to reflect the working
+ *              state allocation of the specified block range. it directly
+ *              updates the bits of the working map and causes the adjustment
+ *              of the binary buddy system described by the dmap's dmtree
+ *              leaves to reflect the bits freed.  it also causes the dmap's
+ *              dmtree, as a whole, to reflect the deallocated range.
+ *              finally, the free counts of the dmap, the allocation group
+ *              and the map are incremented, and the maximum allocation
+ *              group number and preference are lowered if needed (under
+ *              BMAP_LOCK).
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      dp      -  pointer to dmap to free bits from.
+ *      blkno   -  starting block number of the bits to be freed.
+ *      nblocks -  number of bits to be freed.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                       int nblocks)
+{
+        int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
+        dmtree_t *tp = (dmtree_t *) & dp->tree;
+        int size;
+        /* determine the bit number and word within the dmap of the
+         * starting block.
+         */
+        dbitno = blkno & (BPERDMAP - 1);
+        word = dbitno >> L2DBWORD;
+        /* block range better be within the dmap.
+         */
+        assert(dbitno + nblocks <= BPERDMAP);
+        /* free the bits of the dmaps words corresponding to the block range.
+         * not all bits of the first and last words may be contained within
+         * the block range.  if this is the case, we'll work against those
+         * words (i.e. partial first and/or last) on an individual basis
+         * (a single pass), freeing the bits of interest by hand and updating
+         * the leaf corresponding to the dmap word. a single pass will be used
+         * for all dmap words fully contained within the specified range.
+         * within this pass, the bits of all fully contained dmap words will
+         * be marked as free in a single shot and the leaves will be updated. a
+         * single leaf may describe the free space of multiple dmap words,
+         * so we may update only a subset of the actual leaves corresponding
+         * to the dmap words of the block range.
+         *
+         * dbJoin() is used to update leaf values and will join the binary
+         * buddy system of the leaves if the new leaf values indicate this
+         * should be done.
+         */
+        for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+                /* determine the bit number within the word and
+                 * the number of bits within the word.
+                 */
+                wbitno = dbitno & (DBWORD - 1);
+                nb = min(rembits, DBWORD - wbitno);
+                /* check if only part of a word is to be freed.
+                 */
+                if (nb < DBWORD) {
+                        /* free (zero) the appropriate bits within this
+                         * dmap word: clear the nb bits that start wbitno
+                         * bits down from the high-order end.
+                         */
+                        dp->wmap[word] &=
+                            cpu_to_le32(~(ONES << (DBWORD - nb)
+                                          >> wbitno));
+                        /* update the leaf for this dmap word.
+                         */
+                        dbJoin(tp, word,
+                               dbMaxBud((u8 *) & dp->wmap[word]));
+                        word += 1;
+                } else {
+                        /* one or more dmap words are fully contained
+                         * within the block range.  determine how many
+                         * words and free (zero) the bits of these words.
+                         * the byte count is nwords * 4; this presumably
+                         * relies on wmap words being 32 bits wide, as the
+                         * cpu_to_le32() use above suggests.
+                         */
+                        nwords = rembits >> L2DBWORD;
+                        memset(&dp->wmap[word], 0, nwords * 4);
+                        /* determine how many bits.
+                         */
+                        nb = nwords << L2DBWORD;
+                        /* now update the appropriate leaves to reflect
+                         * the freed words.
+                         */
+                        for (; nwords > 0; nwords -= nw) {
+                                /* determine what the leaf value should be
+                                 * updated to as the minimum of the l2 number
+                                 * of bits being freed and the l2 (max) number
+                                 * of bits that can be described by this leaf.
+                                 */
+                                size =
+                                    min(LITOL2BSZ
+                                        (word, L2LPERDMAP, BUDMIN),
+                                        NLSTOL2BSZ(nwords));
+                                /* update the leaf.
+                                 */
+                                dbJoin(tp, word, size);
+                                /* get the number of dmap words handled.
+                                 */
+                                nw = BUDSIZE(size, BUDMIN);
+                                word += nw;
+                        }
+                }
+        }
+        /* update the free count for this dmap.
+         */
+        dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+        BMAP_LOCK(bmp);
+        /* update the free count for the allocation group and
+         * map.
+         */
+        agno = blkno >> bmp->db_agl2size;
+        bmp->db_nfree += nblocks;
+        bmp->db_agfree[agno] += nblocks;
+        /* check if this allocation group is now completely free and
+         * if it is currently the maximum (rightmost) allocation group,
+         * or if it is the last allocation group of the map and its
+         * free count equals the partial size computed below.
+         * if so, establish the new maximum allocation group number by
+         * searching left for the first allocation group with allocation.
+         *
+         * NOTE(review): the partial size of the last allocation group
+         * is taken as db_mapsize modulo BPERDMAP rather than modulo the
+         * allocation group size - confirm this is intended.
+         */
+        if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
+            (agno == bmp->db_numag - 1 &&
+             bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
+                /* step left one group at a time; stop at the first
+                 * group that is not completely free, or at group 0.
+                 */
+                while (bmp->db_maxag > 0) {
+                        bmp->db_maxag -= 1;
+                        if (bmp->db_agfree[bmp->db_maxag] !=
+                            bmp->db_agsize)
+                                break;
+                }
+                /* re-establish the allocation group preference if the
+                 * current preference is right of the maximum allocation
+                 * group.
+                 */
+                if (bmp->db_agpref > bmp->db_maxag)
+                        bmp->db_agpref = bmp->db_maxag;
+        }
+        BMAP_UNLOCK(bmp);
+}
+/*
+ * NAME:        dbAdjCtl()
+ *
+ * FUNCTION:    adjust a dmap control page at a specified level to reflect
+ *              the change in a lower level dmap or dmap control page's
+ *              maximum string of free blocks (i.e. a change in the root
+ *              of the lower level object's dmtree) due to the allocation
+ *              or deallocation of a range of blocks with a single dmap.
+ *
+ *              on entry, this routine is provided with the new value of
+ *              the lower level dmap or dmap control page root and the
+ *              starting block number of the block range whose allocation
+ *              or deallocation resulted in the root change.  this range
+ *              is represented by a single leaf of the current dmapctl
+ *              and the leaf will be updated with this value, possibly
+ *              causing a binary buddy system within the leaves to be
+ *              split or joined.  the update may also cause the dmapctl's
+ *              dmtree to be updated.
+ *
+ *              if the adjustment of the dmap control page, itself, causes its
+ *              root to change, this change will be bubbled up to the next dmap
+ *              control level by a recursive call to this routine, specifying
+ *              the new root value and the next dmap control page level to
+ *              be adjusted.  if that recursive call fails, the change made
+ *              to the current control page is backed out and the page is
+ *              released without being written.
+ *
+ * PARAMETERS:
+ *      bmp     -  pointer to bmap descriptor
+ *      blkno   -  the first block of a block range within a dmap.  it is
+ *                 the allocation or deallocation of this block range that
+ *                 requires the dmap control page to be adjusted.
+ *      newval  -  the new value of the lower level dmap or dmap control
+ *                 page root.
+ *      alloc   -  TRUE if adjustment is due to an allocation.
+ *      level   -  current level of dmap control page (i.e. L0, L1, L2) to
+ *                 be adjusted.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static int
+dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
+{
+        struct metapage *mp;
+        s8 oldroot;
+        int oldval;
+        s64 lblkno;
+        struct dmapctl *dcp;
+        int rc, leafno, ti;
+        /* get the buffer for the dmap control page for the specified
+         * block number and control page level.
+         */
+        lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
+        mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
+        if (mp == NULL)
+                return -EIO;
+        dcp = (struct dmapctl *) mp->data;
+        /* sanity check the page before trusting its leaf index: a
+         * control page whose leafidx is not CTLLEAFIND is reported as
+         * corrupt.
+         */
+        if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
+                jfs_error(bmp->db_ipbmap->i_sb,
+                          "dbAdjCtl: Corrupt dmapctl page");
+                release_metapage(mp);
+                return -EIO;
+        }
+        /* determine the leaf number corresponding to the block and
+         * the index within the dmap control tree.
+         */
+        leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
+        ti = leafno + le32_to_cpu(dcp->leafidx);
+        /* save the current leaf value and the current root level (i.e.
+         * maximum l2 free string described by this dmapctl).
+         */
+        oldval = dcp->stree[ti];
+        oldroot = dcp->stree[ROOT];
+        /* check if this is a control page update for an allocation.
+         * if so, update the leaf to reflect the new leaf value using
+         * dbSplit(); otherwise (deallocation), use dbJoin() to update
+         * the leaf with the new value.  in addition to updating the
+         * leaf, dbSplit() will also split the binary buddy system of
+         * the leaves, if required, and bubble new values within the
+         * dmapctl tree, if required.  similarly, dbJoin() will join
+         * the binary buddy system of leaves and bubble new values up
+         * the dmapctl tree as required by the new leaf value.
+         */
+        if (alloc) {
+                /* check if we are in the middle of a binary buddy
+                 * system.  this happens when we are performing the
+                 * first allocation out of an allocation group that
+                 * is part (not the first part) of a larger binary
+                 * buddy system.  if we are in the middle, back split
+                 * the system prior to calling dbSplit() which assumes
+                 * that it is at the front of a binary buddy system.
+                 */
+                if (oldval == NOFREE) {
+                        dbBackSplit((dmtree_t *) dcp, leafno);
+                        /* re-read the leaf: the back split changed it,
+                         * and this is the value a later backout must
+                         * restore.
+                         */
+                        oldval = dcp->stree[ti];
+                }
+                dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
+        } else {
+                dbJoin((dmtree_t *) dcp, leafno, newval);
+        }
+        /* check if the root of the current dmap control page changed due
+         * to the update and if the current dmap control page is not at
+         * the current top level (i.e. L0, L1, L2) of the map.  if so (i.e.
+         * root changed and this is not the top level), call this routine
+         * again (recursion) for the next higher level of the mapping to
+         * reflect the change in root for the current dmap control page.
+         */
+        if (dcp->stree[ROOT] != oldroot) {
+                /* are we below the top level of the map.  if so,
+                 * bubble the root up to the next higher level.
+                 */
+                if (level < bmp->db_maxlevel) {
+                        /* bubble up the new root of this dmap control page to
+                         * the next level.
+                         */
+                        if ((rc =
+                             dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
+                                      level + 1))) {
+                                /* something went wrong in bubbling up the new
+                                 * root value, so backout the changes to the
+                                 * current dmap control page.
+                                 */
+                                if (alloc) {
+                                        /* undo the split by joining the
+                                         * leaf back to its old value.
+                                         */
+                                        dbJoin((dmtree_t *) dcp, leafno,
+                                               oldval);
+                                } else {
+                                        /* the dbJoin() above might have
+                                         * caused a larger binary buddy system
+                                         * to form and we may now be in the
+                                         * middle of it.  if this is the case,
+                                         * back split the buddies.
+                                         */
+                                        if (dcp->stree[ti] == NOFREE)
+                                                dbBackSplit((dmtree_t *)
+                                                            dcp, leafno);
+                                        dbSplit((dmtree_t *) dcp, leafno,
+                                                dcp->budmin, oldval);
+                                }
+                                /* release the buffer and return the error.
+                                 */
+                                release_metapage(mp);
+                                return (rc);
+                        }
+                } else {
+                        /* we're at the top level of the map. update
+                         * the bmap control page to reflect the size
+                         * of the maximum free buddy system.
+                         */
+                        assert(level == bmp->db_maxlevel);
+                        /* the recorded maximum free buddy should match
+                         * the root as it was before this update; a
+                         * mismatch is reported, but the new root is
+                         * still stored below.
+                         */
+                        if (bmp->db_maxfreebud != oldroot) {
+                                jfs_error(bmp->db_ipbmap->i_sb,
+                                          "dbAdjCtl: the maximum free buddy is "
+                                          "not the old root");
+                        }
+                        bmp->db_maxfreebud = dcp->stree[ROOT];
+                }
+        }
+        /* write the buffer.
+         */
+        write_metapage(mp);
+        return (0);
+}
+/*
+ * NAME:        dbSplit()
+ *
+ * FUNCTION:    store a new value in a dmtree leaf, first carving the
+ *              leaf out of the binary buddy system it heads whenever
+ *              that system is larger than the requested split size.
+ *
+ * PARAMETERS:
+ *      tp      - pointer to the tree containing the leaf.
+ *      leafno  - the number of the leaf to be updated.
+ *      splitsz - the size the binary buddy system starting at the leaf
+ *                must be split to, specified as the log2 number of blocks.
+ *      newval  - the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
+{
+        s8 *leaves = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+        int l2sz, span;
+        /* a leaf above the tree's minimum buddy size heads a multi-leaf
+         * buddy system and may have to be split.
+         */
+        if (leaves[leafno] > tp->dmt_budmin) {
+                /* halve the system repeatedly at this leaf: start one
+                 * l2 size below the current value, with the matching
+                 * distance (in leaves) to the buddy, and give each
+                 * split-off buddy its own value until the requested
+                 * size has been reached.
+                 */
+                l2sz = leaves[leafno] - 1;
+                span = BUDSIZE(l2sz, tp->dmt_budmin);
+                for (; l2sz >= splitsz; l2sz--, span >>= 1)
+                        dbAdjTree(tp, leafno ^ span, l2sz);
+        }
+        /* finally record the new value of the leaf itself in the tree */
+        dbAdjTree(tp, leafno, newval);
+}
+/*
+ * NAME:        dbBackSplit()
+ *
+ * FUNCTION:    back split the binary buddy system of dmtree leaves
+ *              that hold a specified leaf until the specified leaf
+ *              starts its own binary buddy system.
+ *
+ *              the allocators typically perform allocations at the start
+ *              of binary buddy systems and dbSplit() is used to accomplish
+ *              any required splits.  in some cases, however, allocation
+ *              may occur in the middle of a binary system and requires a
+ *              back split, with the split proceeding out from the middle of
+ *              the system (less efficient) rather than the start of the
+ *              system (more efficient).  the cases in which a back split
+ *              is required are rare and are limited to the first allocation
+ *              within an allocation group which is a part (not first part)
+ *              of a larger binary buddy system and a few exception cases
+ *              in which a previous join operation must be backed out.
+ *
+ * PARAMETERS:
+ *      tp      - pointer to the tree containing the leaf.
+ *      leafno  - the number of the leaf to be updated.  its current
+ *                value must be NOFREE (asserted on entry).
+ *
+ * RETURN VALUES: none
+ *
+ * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
+ */
+static void dbBackSplit(dmtree_t * tp, int leafno)
+{
+        int budsz, bud, w, bsz, size;
+        int cursz;
+        s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+        /* leaf should be part (not first part) of a binary
+         * buddy system.
+         */
+        assert(leaf[leafno] == NOFREE);
+        /* the back split is accomplished by iteratively finding the leaf
+         * that starts the buddy system that contains the specified leaf and
+         * splitting that system in two.  this iteration continues until
+         * the specified leaf becomes the start of a buddy system.
+         *
+         * determine maximum possible l2 size for the specified leaf.
+         */
+        size =
+            LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
+                      tp->dmt_budmin);
+        /* determine the number of leaves covered by this size.  this
+         * is the buddy size that we will start with as we search for
+         * the buddy system that contains the specified leaf.
+         */
+        budsz = BUDSIZE(size, tp->dmt_budmin);
+        /* back split.  each pass of the outer loop performs one split
+         * and then re-tests whether the specified leaf is still NOFREE.
+         */
+        while (leaf[leafno] == NOFREE) {
+                /* find the leftmost buddy leaf.  the search restarts
+                 * from the specified leaf with the initial buddy size;
+                 * every step without a hit doubles the buddy size and
+                 * moves w to the lower of w and the buddy just examined.
+                 */
+                for (w = leafno, bsz = budsz;; bsz <<= 1,
+                     w = (w < bud) ? w : bud) {
+                        /* the buddy size must stay below the number of
+                         * leaves in the tree.
+                         */
+                        assert(bsz < le32_to_cpu(tp->dmt_nleafs));
+                        /* determine the buddy.
+                         */
+                        bud = w ^ bsz;
+                        /* check if this buddy is the start of the system.
+                         */
+                        if (leaf[bud] != NOFREE) {
+                                /* split the leaf at the start of the
+                                 * system in two: both the split size and
+                                 * the new leaf value are one l2 size
+                                 * below the current value.
+                                 */
+                                cursz = leaf[bud] - 1;
+                                dbSplit(tp, bud, cursz, cursz);
+                                break;
+                        }
+                }
+        }
+        /* the specified leaf now starts a buddy system of the maximum
+         * size computed above.
+         */
+        assert(leaf[leafno] == size);
+}
+/*
+ * NAME:        dbJoin()
+ *
+ * FUNCTION:    update the leaf of a dmtree with a new value, joining
+ *              the leaf with other leaves of the dmtree into a multi-leaf
+ *              binary buddy system, as required.
+ *
+ * PARAMETERS:
+ *      tp      - pointer to the tree containing the leaf.
+ *      leafno  - the number of the leaf to be updated.
+ *      newval  - the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ */
+static void dbJoin(dmtree_t * tp, int leafno, int newval)
+{
+        s8 *leaves;
+        int span, partner, absorbed;
+        /* values below the tree's minimum buddy size can never take
+         * part in a join; just record the new value.
+         */
+        if (newval < tp->dmt_budmin) {
+                dbAdjTree(tp, leafno, newval);
+                return;
+        }
+        /* locate the leaf level of the summary tree.
+         */
+        leaves = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
+        /* walk up through successively larger buddy sizes.  span is the
+         * number of leaves covered by the current value; at every step
+         * the partner of leafno at that span is examined.  a successful
+         * join doubles the span and bumps the value by one.  the walk
+         * ends at the first partner that cannot be joined, or when the
+         * span reaches the number of leaves in the tree.
+         */
+        for (span = BUDSIZE(newval, tp->dmt_budmin);
+             span < le32_to_cpu(tp->dmt_nleafs);
+             span <<= 1, newval++) {
+                partner = leafno ^ span;
+                /* a partner holding less than the new value ends the
+                 * joining; a partner holding more is not expected.
+                 */
+                if (newval > leaves[partner])
+                        break;
+                assert(newval == leaves[partner]);
+                /* the left member of the pair keeps the combined blocks
+                 * and stays eligible for the next join; the right member
+                 * is marked NOFREE.
+                 */
+                if (leafno < partner) {
+                        absorbed = partner;
+                } else {
+                        absorbed = leafno;
+                        leafno = partner;
+                }
+                dbAdjTree(tp, absorbed, NOFREE);
+        }
+        /* store the final value in the surviving (leftmost) leaf.
+         */
+        dbAdjTree(tp, leafno, newval);
+}
+/*
+ * NAME:        dbAdjTree()
+ *
+ * FUNCTION:    update a leaf of a dmtree with a new value, adjusting
+ *              the dmtree, as required, to reflect the new leaf value.
+ *              the combination of any buddies must already be done before
+ *              this is called.
+ *
+ * PARAMETERS:
+ *      tp      - pointer to the tree to be adjusted.
+ *      leafno  - the number of the leaf to be updated.
+ *      newval  - the new value for the leaf.
+ *
+ * RETURN VALUES: none
+ */
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
+{
+        int node = leafno + le32_to_cpu(tp->dmt_leafidx);
+        int level, first, parent, best;
+        /* nothing to propagate when the leaf already holds newval.
+         */
+        if (tp->dmt_stree[node] == newval)
+                return;
+        tp->dmt_stree[node] = newval;
+        /* climb at most dmt_height levels.  at each level recompute the
+         * parent from its group of four children and stop as soon as a
+         * parent is found that already holds the right value.
+         */
+        level = 0;
+        while (level < le32_to_cpu(tp->dmt_height)) {
+                /* index of the first node of the group of four that
+                 * contains the current node, and of that group's parent.
+                 */
+                first = ((node - 1) & ~0x03) + 1;
+                parent = (first - 1) >> 2;
+                /* largest value among the four siblings.
+                 */
+                best = TREEMAX(&tp->dmt_stree[first]);
+                if (tp->dmt_stree[parent] == best)
+                        return;
+                tp->dmt_stree[parent] = best;
+                /* continue from the parent on the next pass.
+                 */
+                node = parent;
+                level++;
+        }
+}
+/*
+ * NAME:        dbFindLeaf()
+ *
+ * FUNCTION:    search a dmtree_t for sufficient free blocks, returning
+ *              the index of a leaf describing the free blocks if 
+ *              sufficient free blocks are found.
+ *
+ *              the search starts at the top of the dmtree_t tree and
+ *              proceeds down the tree to the leftmost leaf with sufficient
+ *              free space.
+ *
+ * PARAMETERS:
+ *      tp      - pointer to the tree to be searched.
+ *      l2nb    - log2 number of free blocks to search for.
+ *      leafidx - return pointer to be set to the index of the leaf
+ *                describing at least l2nb free blocks if sufficient
+ *                free blocks are found.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOSPC - insufficient free blocks. 
+ */
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
+{
+        int base = 0, off = 0;
+        int first = 1;
+        int levels;
+        /* the root summarizes the whole tree: if it cannot satisfy the
+         * request, nothing below it can.
+         */
+        if (tp->dmt_stree[ROOT] < l2nb)
+                return -ENOSPC;
+        /* descend one level per pass, starting with the four nodes
+         * directly below the root.  first is the index of the leftmost
+         * node of the group to examine on the current level.
+         */
+        levels = le32_to_cpu(tp->dmt_height);
+        while (levels > 0) {
+                base = first;
+                off = 0;
+                /* take the leftmost of the four nodes that is large
+                 * enough.
+                 */
+                while (off < 4 && l2nb > tp->dmt_stree[base + off])
+                        off++;
+                /* the level above promised that one of them fits.
+                 */
+                assert(off < 4);
+                /* first child of the chosen node.
+                 */
+                first = ((first + off) << 2) + 1;
+                levels--;
+        }
+        /* convert the tree index of the chosen node into a leaf number.
+         */
+        *leafidx = base + off - le32_to_cpu(tp->dmt_leafidx);
+        return 0;
+}
+/*
+ * NAME:        dbFindBits()
+ *
+ * FUNCTION:    find a specified number of binary buddy free bits within a
+ *              dmap bitmap word value.
+ *
+ *              this routine searches the bitmap value for (1 << l2nb) free
+ *              bits at (1 << l2nb) alignments within the value.
+ *
+ * PARAMETERS:
+ *      word    -  dmap bitmap word value.
+ *      l2nb    -  number of free bits specified as a log2 number.
+ *
+ * RETURN VALUES:
+ *      starting bit number of free bits.
+ */
+static int dbFindBits(u32 word, int l2nb)
+{
+        int bitno, nb;
+        u32 mask;
+        /* get the number of bits.
+         */
+        nb = 1 << l2nb;
+        assert(nb <= DBWORD);
+        /* complement the word so we can use a mask (i.e. after the
+         * complement, 1s represent free bits) and compute the mask
+         * covering the first nb bit positions.
+         */
+        word = ~word;
+        mask = ONES << (DBWORD - nb);
+        /* scan the word for nb free bits at nb alignments.
+         *
+         * the mask is moved right by nb bits per step.  when nb is the
+         * full 32-bit width there is only one candidate position, and
+         * shifting a 32-bit value by 32 is undefined in C (and leaves
+         * the mask unchanged on hardware that masks the shift count,
+         * which would loop forever here).  in that case the mask is
+         * simply cleared so the scan terminates.
+         */
+        for (bitno = 0; mask != 0;
+             bitno += nb, mask = (nb < 32) ? (mask >> nb) : 0) {
+                if ((mask & word) == mask)
+                        break;
+        }
+        /* bitno reaches 32 only if no suitably aligned free run exists.
+         */
+        ASSERT(bitno < 32);
+        /* return the bit number.
+         */
+        return (bitno);
+}
+/*
+ * NAME:        dbMaxBud(u8 *cp)
+ *
+ * FUNCTION:    determine the largest binary buddy string of free
+ *              bits within 32-bits of the map.
+ *
+ * PARAMETERS:
+ *      cp      -  pointer to the 32-bit value.
+ *
+ * RETURN VALUES:
+ *      largest binary buddy of free bits within a dmap word.
+ */
+static int dbMaxBud(u8 * cp)
+{
+        u16 *halves = (u16 *) cp;
+        signed char upper, lower;
+        /* a word with no bit set is entirely free: the buddy size is
+         * BUDMIN.
+         */
+        if (*((uint *) cp) == 0)
+                return BUDMIN;
+        /* if either 16-bit half has no bit set, the buddy size is
+         * BUDMIN-1.
+         */
+        if (halves[0] == 0 || halves[1] == 0)
+                return BUDMIN - 1;
+        /* otherwise look each of the four bytes up in budtab and
+         * return the largest entry.
+         */
+        upper = max(budtab[cp[2]], budtab[cp[3]]);
+        lower = max(budtab[cp[0]], budtab[cp[1]]);
+        return max(upper, lower);
+}
+/*
+ * NAME:        cnttz(u32 word)
+ *
+ * FUNCTION:    determine the number of trailing zeros within a 32-bit
+ *              value.
+ *
+ * PARAMETERS:
+ *      word    -  32-bit value to be examined.
+ *
+ * RETURN VALUES:
+ *      count of trailing zeros
+ */
+static int cnttz(u32 word)
+{
+        int count = 0;
+        /* drop low-order bits until a set bit reaches bit 0 or all 32
+         * positions have been examined (a zero word yields 32).
+         */
+        while (count < 32 && !(word & 0x01)) {
+                word >>= 1;
+                count++;
+        }
+        return count;
+}
+/*
+ * NAME:        cntlz(u32 value)
+ *
+ * FUNCTION:    determine the number of leading zeros within a 32-bit
+ *              value.
+ *
+ * PARAMETERS:
+ *      value   -  32-bit value to be examined.
+ *
+ * RETURN VALUES:
+ *      count of leading zeros
+ */
+static int cntlz(u32 value)
+{
+        int count = 0;
+        /* shift left until the HIGHORDER bit is set or all 32 positions
+         * have been examined (a zero value yields 32).
+         */
+        while (count < 32 && !(value & HIGHORDER)) {
+                value <<= 1;
+                count++;
+        }
+        return count;
+}
+/*
+ * NAME:        blkstol2(s64 nb)
+ *
+ * FUNCTION:    convert a block count to its log2 value. if the block
+ *              count is not a l2 multiple, it is rounded up to the next
+ *              larger l2 multiple.
+ *
+ * PARAMETERS:
+ *      nb      -  number of blocks
+ *
+ * RETURN VALUES:
+ *      log2 number of blocks
+ */
+int blkstol2(s64 nb)
+{
+        int l2nb;
+        s64 mask;               /* meant to be signed: the scan below appears to rely on >> keeping the sign bit set (implementation-defined in C) */
+        mask = (s64) 1 << (64 - 1);
+        /* count the leading bits.  the scan starts with only bit 63 set
+         * in mask and moves one position right per pass; with a
+         * sign-propagating shift, mask then covers the probed bit and
+         * every bit above it.
+         */
+        for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
+                /* leading bit found.
+                 */
+                if (nb & mask) {
+                        /* determine the l2 value: l2nb counted the
+                         * positions skipped from bit 63 downwards.
+                         */
+                        l2nb = (64 - 1) - l2nb;
+                        /* check if we need to round up: any bit of nb
+                         * outside the mask means nb is not an exact
+                         * power of two.
+                         */
+                        if (~mask & nb)
+                                l2nb++;
+                        return (l2nb);
+                }
+        }
+        assert(0);              /* reached only when no bit of nb is set, i.e. nb == 0 */
+        return 0;               /* fix compiler warning */
+}
+/*
+ * NAME:        dbAllocBottomUp()
+ *
+ * FUNCTION:    alloc the specified block range from the working block
+ *              allocation map.
+ *
+ *              the blocks will be alloc from the working map one dmap
+ *              at a time.
+ *
+ * PARAMETERS:
+ *      ip      -  pointer to in-core inode;
+ *      blkno   -  starting block number to be allocated.
+ *      nblocks -  number of blocks to be allocated.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error
+ */
+int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
+{
+        struct metapage *mp;
+        struct dmap *dp;
+        int nb, rc;
+        s64 lblkno, rem;
+        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+        struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+        /* mark the block range [blkno, blkno + nblocks) as allocated,
+         * one dmap page at a time, while holding the read lock on the
+         * block map inode.  returns 0 on success, -EIO if a dmap page
+         * cannot be read, or the error returned by dbAllocDmapBU().
+         */
+        IREAD_LOCK(ipbmap);
+        /* block to be allocated better be within the mapsize. */
+        ASSERT(nblocks <= bmp->db_mapsize - blkno);
+        /*
+         * allocate the blocks a dmap at a time.
+         */
+        mp = NULL;
+        for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+                /* the dmap page of the previous pass is complete:
+                 * write it out before moving to the next one.
+                 */
+                if (mp) {
+                        write_metapage(mp);
+                }
+                /* get the buffer for the current dmap. */
+                lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+                mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+                if (mp == NULL) {
+                        IREAD_UNLOCK(ipbmap);
+                        return -EIO;
+                }
+                dp = (struct dmap *) mp->data;
+                /* determine the number of blocks to be allocated from
+                 * this dmap: either everything that remains, or the
+                 * blocks from blkno up to the end of this dmap.
+                 */
+                nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+                DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+                /* allocate the blocks.  on failure the page is released
+                 * without being written.
+                 */
+                if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
+                        release_metapage(mp);
+                        IREAD_UNLOCK(ipbmap);
+                        return (rc);
+                }
+                DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+        }
+        /* write the last buffer.  mp is still NULL when the loop body
+         * never ran (nblocks <= 0); there is nothing to write in that
+         * case and the NULL pointer must not be handed on.
+         */
+        if (mp)
+                write_metapage(mp);
+        IREAD_UNLOCK(ipbmap);
+        return (0);
+}
+static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
+                         int nblocks)
+{
+        int rc;
+        int dbitno, word, rembits, nb, nwords, wbitno, agno;
+        s8 oldroot, *leaf;
+        struct dmaptree *tp = (struct dmaptree *) & dp->tree;
+        /* dbAllocDmapBU(): set the wmap bits of one dmap for nblocks
+         * blocks starting at blkno, rebuild the dmap summary tree with
+         * dbInitDmapTree(), and update the free counts and db_maxag in
+         * the bmap.  returns 0, or the non-zero value returned by
+         * dbAdjCtl() when the root change cannot be propagated.
+         *
+         * save the current value of the root (i.e. maximum free string)
+         * of the dmap tree.
+         */
+        oldroot = tp->stree[ROOT];
+        /* pick up a pointer to the leaves of the dmap tree.
+         * NOTE(review): leaf is not read again in this function.
+         */
+        leaf = tp->stree + LEAFIND;
+        /* determine the bit number and word within the dmap of the
+         * starting block.
+         */
+        dbitno = blkno & (BPERDMAP - 1);
+        word = dbitno >> L2DBWORD;
+        /* block range better be within the dmap */
+        assert(dbitno + nblocks <= BPERDMAP);
+        /* allocate the bits of the dmap's words corresponding to the block
+         * range. not all bits of the first and last words may be contained
+         * within the block range.  if this is the case, we'll work against
+         * those words (i.e. partial first and/or last) on an individual basis
+         * (a single pass), allocating the bits of interest by hand.
+         * a single pass will be used for all dmap words fully contained
+         * within the specified range.  within this pass, the bits of all
+         * fully contained dmap words will be marked as allocated in a
+         * single shot.  the leaves are not updated word by word here;
+         * the whole summary tree is rebuilt by dbInitDmapTree() after
+         * the loop.
+         */
+        for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
+                /* determine the bit number within the word and
+                 * the number of bits within the word.
+                 */
+                wbitno = dbitno & (DBWORD - 1);
+                nb = min(rembits, DBWORD - wbitno);
+                /* check if only part of a word is to be allocated.
+                 */
+                if (nb < DBWORD) {
+                        /* allocate (set to 1) the appropriate bits within
+                         * this dmap word: a run of nb one-bits is built
+                         * at the top of the word and then moved right
+                         * by wbitno positions.
+                         */
+                        dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
+                                                      >> wbitno);
+                        word++;
+                } else {
+                        /* one or more dmap words are fully contained
+                         * within the block range.  determine how many
+                         * words and allocate (set to 1) the bits of these
+                         * words.
+                         */
+                        nwords = rembits >> L2DBWORD;
+                        memset(&dp->wmap[word], (int) ONES, nwords * 4);
+                        /* determine how many bits were consumed so the
+                         * loop advances past all of these words.
+                         */
+                        nb = nwords << L2DBWORD;
+                        word += nwords;
+                }
+        }
+        /* update the free count for this dmap */
+        dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+        /* reconstruct summary tree */
+        dbInitDmapTree(dp);
+        BMAP_LOCK(bmp);
+        /* raise the highest active allocation group number if the
+         * allocation group containing blkno lies beyond the current
+         * maximum.
+         */
+        agno = blkno >> bmp->db_agl2size;
+        if (agno > bmp->db_maxag)
+                bmp->db_maxag = agno;
+        /* update the free count for the allocation group and map */
+        bmp->db_agfree[agno] -= nblocks;
+        bmp->db_nfree -= nblocks;
+        BMAP_UNLOCK(bmp);
+        /* if the root has not changed, done. */
+        if (tp->stree[ROOT] == oldroot)
+                return (0);
+        /* root changed. bubble the change up to the dmap control pages.
+         * if the adjustment of the upper level control pages fails,
+         * backout the bit allocation (thus making everything consistent)
+         * by calling dbFreeBits() on the same range, and return the
+         * error.
+         */
+        if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
+                dbFreeBits(bmp, dp, blkno, nblocks);
+        return (rc);
+}
+/*
+ * NAME:        dbExtendFS()
+ *
+ * FUNCTION:    extend bmap from blkno for nblocks;
+ *              dbExtendFS() updates bmap ready for dbAllocBottomUp();
+ *
+ * L2
+ *  |
+ *   L1---------------------------------L1
+ *    |                                  |
+ *     L0---------L0---------L0           L0---------L0---------L0
+ *      |          |          |            |          |          |
+ *       d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
+ * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
+ *
+ * <---old---><----------------------------extend----------------------->   
+ */
+int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
+        int nbperpage = sbi->nbperpage;
+        int i, i0 = TRUE, j, j0 = TRUE, k, n;   /* i0/j0: TRUE until the first L0/L1 page has been picked up */
+        s64 newsize;
+        s64 p;
+        struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
+        struct dmapctl *l2dcp, *l1dcp, *l0dcp;
+        struct dmap *dp;
+        s8 *l0leaf, *l1leaf, *l2leaf;
+        struct bmap *bmp = sbi->bmap;
+        int agno, l2agsize, oldl2agsize;
+        s64 ag_rem;
+        newsize = blkno + nblocks;
+        jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
+                 (long long) blkno, (long long) nblocks, (long long) newsize);
+        /*
+         *      initialize bmap control page.
+         *
+         * all the data in bmap control page should exclude
+         * the mkfs hidden dmap page.
+         *
+         * returns 0 on success and -EIO on any page read failure or
+         * when the loops below end without reaching the finalize label.
+         */
+        /* update mapsize */
+        bmp->db_mapsize = newsize;
+        bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
+        /* compute new AG size */
+        l2agsize = dbGetL2AGSize(newsize);
+        oldl2agsize = bmp->db_agl2size;
+        bmp->db_agl2size = l2agsize;
+        bmp->db_agsize = 1 << l2agsize;
+        /* compute new number of AG: whole groups plus one more for a
+         * trailing partial group.  agno keeps the old group count for
+         * the coalescing loop below.
+         */
+        agno = bmp->db_numag;
+        bmp->db_numag = newsize >> l2agsize;
+        bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
+        /*
+         *      reconfigure db_agfree[]
+         * from old AG configuration to new AG configuration;
+         *
+         * coalesce contiguous k (newAGSize/oldAGSize) AGs;
+         * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
+         * note: new AG size = old AG size * (2**x).
+         *
+         * nothing to reconfigure when the AG size is unchanged.
+         */
+        if (l2agsize == oldl2agsize)
+                goto extend;
+        k = 1 << (l2agsize - oldl2agsize);
+        ag_rem = bmp->db_agfree[0];     /* save agfree[0]: it is zeroed below before it is summed into itself */
+        for (i = 0, n = 0; i < agno; n++) {
+                bmp->db_agfree[n] = 0;  /* init collection point */
+                /* coalesce contiguous k AGs; */
+                for (j = 0; j < k && i < agno; j++, i++) {
+                        /* merge AGi to AGn */
+                        bmp->db_agfree[n] += bmp->db_agfree[i];
+                }
+        }
+        bmp->db_agfree[0] += ag_rem;    /* restore agfree[0] */
+        for (; n < MAXAG; n++)
+                bmp->db_agfree[n] = 0;
+        /*
+         * update highest active ag number
+         */
+        bmp->db_maxag = bmp->db_maxag / k;
+        /*
+         *      extend bmap
+         *
+         * update bit maps and corresponding level control pages;
+         * global control page db_nfree, db_agfree[agno], db_maxfreebud;
+         */
+      extend:
+        /* get L2 page */
+        p = BMAPBLKNO + nbperpage;      /* L2 page */
+        l2mp = read_metapage(ipbmap, p, PSIZE, 0);
+        if (!l2mp) {
+                jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
+                return -EIO;
+        }
+        l2dcp = (struct dmapctl *) l2mp->data;
+        /* compute start L1 */
+        k = blkno >> L2MAXL1SIZE;
+        l2leaf = l2dcp->stree + CTLLEAFIND + k;
+        p = BLKTOL1(blkno, sbi->l2nbperpage);   /* L1 page */
+        /*
+         * extend each L1 in L2
+         */
+        for (; k < LPERCTL; k++, p += nbperpage) {
+                /* get L1 page: the first one visited already exists and
+                 * is read; every later one is obtained with
+                 * get_metapage().
+                 */
+                if (j0) {
+                        /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
+                        l1mp = read_metapage(ipbmap, p, PSIZE, 0);
+                        if (l1mp == NULL)
+                                goto errout;
+                        l1dcp = (struct dmapctl *) l1mp->data;
+                        /* compute start L0 */
+                        j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
+                        l1leaf = l1dcp->stree + CTLLEAFIND + j;
+                        p = BLKTOL0(blkno, sbi->l2nbperpage);
+                        j0 = FALSE;
+                } else {
+                        /* assign/init L1 page */
+                        l1mp = get_metapage(ipbmap, p, PSIZE, 0);
+                        if (l1mp == NULL)
+                                goto errout;
+                        l1dcp = (struct dmapctl *) l1mp->data;
+                        /* compute start L0 */
+                        j = 0;
+                        l1leaf = l1dcp->stree + CTLLEAFIND;
+                        p += nbperpage; /* 1st L0 of L1.k  */
+                }
+                /*
+                 * extend each L0 in L1
+                 */
+                for (; j < LPERCTL; j++) {
+                        /* get L0 page: same read-first / get-later
+                         * scheme as for the L1 pages, keyed on i0.
+                         */
+                        if (i0) {
+                                /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
+                                l0mp = read_metapage(ipbmap, p, PSIZE, 0);
+                                if (l0mp == NULL)
+                                        goto errout;
+                                l0dcp = (struct dmapctl *) l0mp->data;
+                                /* compute start dmap */
+                                i = (blkno & (MAXL0SIZE - 1)) >>
+                                    L2BPERDMAP;
+                                l0leaf = l0dcp->stree + CTLLEAFIND + i;
+                                p = BLKTODMAP(blkno,
+                                              sbi->l2nbperpage);
+                                i0 = FALSE;
+                        } else {
+                                /* assign/init L0 page */
+                                l0mp = get_metapage(ipbmap, p, PSIZE, 0);
+                                if (l0mp == NULL)
+                                        goto errout;
+                                l0dcp = (struct dmapctl *) l0mp->data;
+                                /* compute start dmap */
+                                i = 0;
+                                l0leaf = l0dcp->stree + CTLLEAFIND;
+                                p += nbperpage; /* 1st dmap of L0.j */
+                        }
+                        /*
+                         * extend each dmap in L0
+                         */
+                        for (; i < LPERCTL; i++) {
+                                /*
+                                 * reconstruct the dmap page, and
+                                 * initialize corresponding parent L0 leaf.
+                                 * n becomes the number of blocks this
+                                 * dmap contributes: the rest of a
+                                 * partially used dmap, or up to a full
+                                 * BPERDMAP for a dmap starting at blkno.
+                                 */
+                                if ((n = blkno & (BPERDMAP - 1))) {
+                                        /* read in dmap page: */
+                                        mp = read_metapage(ipbmap, p,
+                                                           PSIZE, 0);
+                                        if (mp == NULL)
+                                                goto errout;
+                                        n = min(nblocks, (s64)BPERDMAP - n);
+                                } else {
+                                        /* assign/init dmap page
+                                         * NOTE(review): this branch also
+                                         * uses read_metapage(), unlike
+                                         * the L0/L1 init branches.
+                                         */
+                                        mp = read_metapage(ipbmap, p,
+                                                           PSIZE, 0);
+                                        if (mp == NULL)
+                                                goto errout;
+                                        n = min(nblocks, (s64)BPERDMAP);
+                                }
+                                dp = (struct dmap *) mp->data;
+                                *l0leaf = dbInitDmap(dp, blkno, n);
+                                bmp->db_nfree += n;
+                                agno = le64_to_cpu(dp->start) >> l2agsize;
+                                bmp->db_agfree[agno] += n;
+                                write_metapage(mp);
+                                l0leaf++;
+                                p += nbperpage;
+                                blkno += n;
+                                nblocks -= n;
+                                if (nblocks == 0)
+                                        break;
+                        }       /* for each dmap in a L0 */
+                        /*
+                         * build current L0 page from its leaves, and
+                         * initialize corresponding parent L1 leaf
+                         */
+                        *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
+                        write_metapage(l0mp);
+                        l0mp = NULL;
+                        if (nblocks)
+                                l1leaf++;       /* continue for next L0 */
+                        else {
+                                /* more than 1 L0 ? */
+                                if (j > 0)
+                                        break;  /* build L1 page */
+                                else {
+                                        /* summarize in global bmap page */
+                                        bmp->db_maxfreebud = *l1leaf;
+                                        release_metapage(l1mp);
+                                        release_metapage(l2mp);
+                                        goto finalize;
+                                }
+                        }
+                }               /* for each L0 in a L1 */
+                /*
+                 * build current L1 page from its leaves, and
+                 * initialize corresponding parent L2 leaf
+                 */
+                *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
+                write_metapage(l1mp);
+                l1mp = NULL;
+                if (nblocks)
+                        l2leaf++;       /* continue for next L1 */
+                else {
+                        /* more than 1 L1 ? */
+                        if (k > 0)
+                                break;  /* build L2 page; NOTE(review): control then reaches the jfs_error() below and returns -EIO */
+                        else {
+                                /* summarize in global bmap page */
+                                bmp->db_maxfreebud = *l2leaf;
+                                release_metapage(l2mp);
+                                goto finalize;
+                        }
+                }
+        }                       /* for each L1 in a L2 */
+        jfs_error(ipbmap->i_sb,
+                  "dbExtendFS: function has not returned as expected");
+errout:                         /* release whichever control pages are still held */
+        if (l0mp)
+                release_metapage(l0mp);
+        if (l1mp)
+                release_metapage(l1mp);
+        release_metapage(l2mp);
+        return -EIO;
+        /*
+         *      finalize bmap control page
+         *
+         * nothing further is done here; the label only returns success.
+         */
+finalize:
+        return 0;
+}
+/*
+ * NAME:        dbFinalizeBmap()
+ *
+ * FUNCTION:    recompute the derived fields of the in-memory bmap
+ *              descriptor: the preferred allocation group (db_agpref)
+ *              and the placement of an allocation group inside the
+ *              dmapctl summary tree (db_aglevel, db_agheigth,
+ *              db_agwidth, db_agstart).
+ *
+ * PARAMETERS:
+ *      ipbmap  - block allocation map inode; the bmap is taken from
+ *                JFS_SBI(ipbmap->i_sb)->bmap
+ *
+ * RETURNS: NONE; jfs_error() is raised when the db_agpref search ends
+ *          at or past db_numag.
+ */
+void dbFinalizeBmap(struct inode *ipbmap)
+{
+        struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+        int actags, inactags, l2nl;
+        s64 ag_rem, actfree, inactfree, avgfree;
+        int i, n;
+        /*
+         *      finalize bmap control page
+         */
+//finalize:
+        /*
+         * compute db_agpref: preferred ag to allocate from
+         * (the leftmost ag with average free space in it);
+         */
+//agpref:
+        /* get the number of active ags and inactive ags */
+        actags = bmp->db_maxag + 1;
+        inactags = bmp->db_numag - actags;
+        ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);        /* blocks in a trailing partial ag; the mask presumes db_agsize is a power of 2 */
+        /* determine how many blocks are in the inactive allocation
+         * groups. in doing this, we must account for the fact that
+         * the rightmost group might be a partial group (i.e. file
+         * system size is not a multiple of the group size).
+         */
+        inactfree = (inactags && ag_rem) ?
+            ((inactags - 1) << bmp->db_agl2size) + ag_rem
+            : inactags << bmp->db_agl2size;
+        /* determine how many free blocks are in the active
+         * allocation groups plus the average number of free blocks
+         * within the active ags.
+         * NOTE(review): both operands are truncated to u32 before the
+         * division, and actags is not checked for zero - confirm the
+         * callers guarantee db_maxag >= 0 and that actfree fits 32 bits.
+         */
+        actfree = bmp->db_nfree - inactfree;
+        avgfree = (u32) actfree / (u32) actags;
+        /* if the preferred allocation group has not average free space.
+         * re-establish the preferred group as the leftmost
+         * group with average free space.
+         * NOTE(review): the search stops at actags while the failure
+         * check compares against db_numag, so an unsuccessful search is
+         * only reported when actags == db_numag - confirm intended.
+         */
+        if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
+                for (bmp->db_agpref = 0; bmp->db_agpref < actags;
+                     bmp->db_agpref++) {
+                        if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
+                                break;
+                }
+                if (bmp->db_agpref >= bmp->db_numag) {
+                        jfs_error(ipbmap->i_sb,
+                                  "cannot find ag with average freespace");
+                }
+        }
+        /*
+         * compute db_aglevel, db_agheigth, db_agwidth, db_agstart:
+         * an ag is covered in aglevel dmapctl summary tree,
+         * at agheight level height (from leaf) with agwidth number of nodes
+         * each, which starts at agstart index node of the summary tree node
+         * array;
+         *
+         * l2nl is log2 of the number of leaves of the aglevel dmapctl that
+         * one ag spans; each tree level covers 2 bits of it (4 children
+         * per node), hence agheigth = l2nl / 2 and agwidth absorbs the
+         * remaining bit.  db_agstart is the sum 1 + 4 + 16 + ... over the
+         * (5 - agheigth) tree levels that precede the ag's level.
+         */
+        bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
+        l2nl =
+            bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
+        bmp->db_agheigth = l2nl >> 1;
+        bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
+        for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
+             i--) {
+                bmp->db_agstart += n;
+                n <<= 2;
+        }
+}
+/*
+ * NAME:        dbInitDmap()/ujfs_idmap_page()
+ *
+ * FUNCTION:    initialize working/persistent bitmap of the dmap page
+ *              for the specified number of blocks:
+ *
+ *              at entry, the bitmaps had been initialized as free (ZEROS);
+ *              The number of blocks will only account for the actually
+ *              existing blocks. Blocks which don't actually exist in
+ *              the aggregate will be marked as allocated (ONES);
+ *
+ *              when Blkno is the first block of the dmap, nblocks, nfree
+ *              and start are set from the arguments (and a full dmap of
+ *              BPERDMAP blocks simply has both maps zeroed); otherwise
+ *              nblocks is added to the dmap's existing nblocks and nfree.
+ *
+ * PARAMETERS:
+ *      dp      - pointer to page of map
+ *      Blkno   - aggregate block number of the first block to initialize;
+ *                its offset within the dmap is Blkno & (BPERDMAP - 1)
+ *      nblocks - number of blocks this page
+ *
+ * RETURNS: the value returned by dbInitDmapTree() for this dmap
+ */
+static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
+{
+        int blkno, w, b, r, nw, nb, i;
+        /* starting block number within the dmap */
+        blkno = Blkno & (BPERDMAP - 1);
+        if (blkno == 0) {
+                dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
+                dp->start = cpu_to_le64(Blkno);
+                if (nblocks == BPERDMAP) {
+                        memset(&dp->wmap[0], 0, LPERDMAP * 4);
+                        memset(&dp->pmap[0], 0, LPERDMAP * 4);
+                        goto initTree;
+                }
+        } else {
+                dp->nblocks =
+                    cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
+                dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+        }
+        /* word number containing start block number */
+        w = blkno >> L2DBWORD;
+        /*
+         * free the bits corresponding to the block range (ZEROS):
+         * note: not all bits of the first and last words may be contained
+         * within the block range.
+         * r counts the blocks still to free; nb is the number handled by
+         * the current pass and drives both r and blkno in the loop header.
+         */
+        for (r = nblocks; r > 0; r -= nb, blkno += nb) {
+                /* number of bits preceding range to be freed in the word */
+                b = blkno & (DBWORD - 1);
+                /* number of bits to free in the word */
+                nb = min(r, DBWORD - b);
+                /* is partial word to be freed ? */
+                if (nb < DBWORD) {
+                        /* free (set to 0) from the bitmap word: the mask is
+                         * nb one-bits starting b bits below the top bit
+                         */
+                        dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
+                                                     >> b));
+                        dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
+                                                     >> b));
+                        /* skip the word freed */
+                        w++;
+                } else {
+                        /* free (set to 0) contiguous bitmap words:
+                         * nw is the number of whole words left in r
+                         */
+                        nw = r >> L2DBWORD;
+                        memset(&dp->wmap[w], 0, nw * 4);
+                        memset(&dp->pmap[w], 0, nw * 4);
+                        /* skip the words freed: nb becomes the bit count of
+                         * all nw words so the loop header steps over them
+                         */
+                        nb = nw << L2DBWORD;
+                        w += nw;
+                }
+        }
+        /*
+         * mark bits following the range to be freed (non-existing
+         * blocks) as allocated (ONES); nothing to do when the range
+         * ended exactly at the end of the dmap.
+         */
+        if (blkno == BPERDMAP)
+                goto initTree;
+        /* the first word beyond the end of existing blocks */
+        w = blkno >> L2DBWORD;
+        /* does nblocks fall on a 32-bit boundary ? */
+        b = blkno & (DBWORD - 1);
+        if (b) {
+                /* mark a partial word allocated; note this is a plain
+                 * assignment, so the word's leading b bits end up zero
+                 */
+                dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
+                w++;
+        }
+        /* set the rest of the words in the page to allocated (ONES) */
+        for (i = w; i < LPERDMAP; i++)
+                dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
+        /*
+         * init tree
+         */
+      initTree:
+        return (dbInitDmapTree(dp));
+}
+/*
+ * NAME:        dbInitDmapTree()/ujfs_complete_dmap()
+ *
+ * FUNCTION:    fill in the header of a dmap's summary tree, derive every
+ *              leaf from the matching working-map word and build the
+ *              tree above the leaves.
+ *
+ *              at entry, the working map (wmap) of the dmap has been
+ *              initialized.
+ *
+ * PARAMETERS:
+ *      dp      - dmap whose summary tree is to be built
+ *
+ * RETURNS:     the value returned by dbInitTree() for the dmap's tree
+ *              (the root of the summary tree)
+ */
+static int dbInitDmapTree(struct dmap * dp)
+{
+        struct dmaptree *tree = &dp->tree;
+        s8 *leaf;
+        int word;
+        /* fixed geometry of a dmap summary tree */
+        tree->nleafs = cpu_to_le32(LPERDMAP);
+        tree->l2nleafs = cpu_to_le32(L2LPERDMAP);
+        tree->leafidx = cpu_to_le32(LEAFIND);
+        tree->height = cpu_to_le32(4);
+        tree->budmin = BUDMIN;
+        /* one leaf per wmap word, starting at the first leaf slot;
+         * dbMaxBud() supplies the leaf value (per the original note it
+         * yields NOFREE(-1) for a fully allocated word - dbMaxBud() is
+         * defined elsewhere).
+         */
+        leaf = tree->stree + le32_to_cpu(tree->leafidx);
+        for (word = 0; word < LPERDMAP; word++)
+                leaf[word] = dbMaxBud((u8 *) & dp->wmap[word]);
+        /* combine buddies and propagate the maxima up to the root */
+        return dbInitTree(tree);
+}
+/*
+ * NAME:        dbInitTree()/ujfs_adjtree()
+ *
+ * FUNCTION:    initialize binary buddy summary tree of a dmap or dmapctl.
+ *
+ *              at entry, the leaves of the tree has been initialized
+ *              from corresponding bitmap word or root of summary tree
+ *              of the child control page;
+ *              configure binary buddy system at the leaf level, then
+ *              bubble up the values of the leaf nodes up the tree.
+ *
+ * PARAMETERS:
+ *      dtp     - tree to build; its nleafs, l2nleafs, leafidx and budmin
+ *                fields must already be set, and the leaf entries of
+ *                stree filled in.  dbInitDmapCtl() passes a struct
+ *                dmapctl cast to this type.
+ *
+ * RETURNS: max free string at the root of the tree (stree[0])
+ */
+static int dbInitTree(struct dmaptree * dtp)
+{
+        int l2max, l2free, bsize, nextb, i;
+        int child, parent, nparent;
+        s8 *tp, *cp, *cp1;
+        tp = dtp->stree;
+        /* Determine the maximum free string possible for the leaves */
+        l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
+        /*
+         * configure the leaf level into binary buddy system
+         *
+         * Try to combine buddies starting with a buddy size of 1
+         * (i.e. two leaves). At a buddy size of 1 two buddy leaves
+         * can be combined if both buddies have a maximum free of budmin;
+         * the combination will result in the left-most buddy leaf having
+         * a maximum free of budmin+1 and the right buddy being set to -1.
+         * After processing all buddies for a given size, process buddies
+         * at the next higher buddy size (i.e. current size * 2) and
+         * the next maximum free (current free + 1).
+         * This continues until the maximum possible buddy combination
+         * yields maximum free.
+         */
+        for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
+             l2free++, bsize = nextb) {
+                /* get next buddy size == current buddy pair size */
+                nextb = bsize << 1;
+                /* scan each adjacent buddy pair at current buddy size */
+                for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
+                     i < le32_to_cpu(dtp->nleafs);
+                     i += nextb, cp += nextb) {
+                        /* coalesce if both adjacent buddies are max free */
+                        if (*cp == l2free && *(cp + bsize) == l2free) {
+                                *cp = l2free + 1;       /* left take right */
+                                *(cp + bsize) = -1;     /* right give left */
+                        }
+                }
+        }
+        /*
+         * bubble summary information of leaves up the tree.
+         *
+         * Starting at the leaf node level, the four nodes described by
+         * the higher level parent node are compared for a maximum free and
+         * this maximum becomes the value of the parent node.
+         * when all lower level nodes are processed in this fashion then
+         * move up to the next level (parent becomes a lower level node) and
+         * continue the process for that level.
+         * nparent is the node count of the level being written; it
+         * shrinks by a factor of 4 per level until the root is done.
+         */
+        for (child = le32_to_cpu(dtp->leafidx),
+             nparent = le32_to_cpu(dtp->nleafs) >> 2;
+             nparent > 0; nparent >>= 2, child = parent) {
+                /* get index of 1st node of parent level */
+                parent = (child - 1) >> 2;
+                /* set the value of the parent node as the maximum
+                 * of the four nodes of the current level.
+                 */
+                for (i = 0, cp = tp + child, cp1 = tp + parent;
+                     i < nparent; i++, cp += 4, cp1++)
+                        *cp1 = TREEMAX(cp);
+        }
+        return (*tp);
+}
+/*
+ * NAME:        dbInitDmapCtl()
+ *
+ * FUNCTION:    initialize a dmapctl page: set the fixed tree header,
+ *              mark the leaves that have no child page as NOFREE and
+ *              build the summary tree over all leaves.
+ *
+ * PARAMETERS:
+ *      dcp     - dmapctl page to initialize
+ *      level   - zero-origin level of the page; budmin becomes
+ *                L2BPERDMAP + L2LPERCTL * level
+ *      i       - index of the first leaf not covered by the block range;
+ *                leaves i .. LPERCTL - 1 are set to NOFREE
+ *
+ * RETURNS:     the value returned by dbInitTree() for the page's tree
+ */
+static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
+{
+        int leaf;
+        /* fixed geometry of a dmapctl summary tree */
+        dcp->nleafs = cpu_to_le32(LPERCTL);
+        dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
+        dcp->leafidx = cpu_to_le32(CTLLEAFIND);
+        dcp->height = cpu_to_le32(5);
+        dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
+        /*
+         * leaves from i onward have no lower level dmapctl or dmap
+         * behind them: nothing can be allocated there.
+         */
+        for (leaf = i; leaf < LPERCTL; leaf++)
+                dcp->stree[CTLLEAFIND + leaf] = NOFREE;
+        /* the dmapctl header is laid out like a dmaptree, so the generic
+         * tree builder can work on it through a cast
+         */
+        return dbInitTree((struct dmaptree *) dcp);
+}
+/*
+ * NAME:        dbGetL2AGSize()/ujfs_getagl2size()
+ *
+ * FUNCTION:    Determine log2(allocation group size) from aggregate size
+ *
+ * PARAMETERS:
+ *      nblocks - Number of blocks in aggregate
+ *
+ * RETURNS: log2(allocation group size) in aggregate blocks; L2BPERDMAP
+ *          for aggregates smaller than BPERDMAP * MAXAG blocks
+ */
+static int dbGetL2AGSize(s64 nblocks)
+{
+        s64 probe, rounded;
+        int l2sz = 64;
+        /* small aggregate: one dmap per allocation group */
+        if (nblocks < BPERDMAP * MAXAG)
+                return L2BPERDMAP;
+        /*
+         * scan down from the top bit for the highest bit set in nblocks.
+         * l2sz starts at 64 while the probe starts at bit 63, so when the
+         * scan stops l2sz is one more than the index of that bit.
+         */
+        probe = ((u64) 1 << (64 - 1));
+        while (l2sz >= 0 && !(probe & nblocks)) {
+                l2sz--;
+                probe >>= 1;
+        }
+        /* round up aggregate size to power of 2 */
+        rounded = (s64) 1 << l2sz;
+        if (rounded < nblocks)
+                l2sz += 1;
+        /* agsize = roundupSize/max_number_of_ag */
+        return l2sz - L2MAXAG;
+}
+/*
+ * NAME:        dbMapFileSizeToMapSize()
+ *
+ * FUNCTION:    compute number of blocks the block allocation map file
+ *              can cover from the map file size;
+ *
+ * PARAMETERS:
+ *      ipbmap  - block allocation map inode; its i_size and the
+ *                superblock's l2bsize/l2nbperpage give the page count
+ *
+ * RETURNS:     Number of blocks which can be covered by this block map file;
+ */
+/*
+ * maximum number of map pages at each level including control pages
+ * (one control page plus LPERCTL children of the level below)
+ */
+#define MAXL0PAGES      (1 + LPERCTL)
+#define MAXL1PAGES      (1 + LPERCTL * MAXL0PAGES)
+#define MAXL2PAGES      (1 + LPERCTL * MAXL1PAGES)
+/*
+ * convert number of map pages to the zero origin top dmapctl level;
+ * the constants 3 and 2 presumably stand for the global control page
+ * plus the higher level control pages that are always present.
+ */
+#define BMAPPGTOLEV(npages)     \
+        (((npages) <= 3 + MAXL0PAGES) ? 0 \
+       : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
+s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
+{
+        struct super_block *sb = ipbmap->i_sb;
+        s64 nblocks;
+        s64 npages, ndmaps;
+        int level, i;
+        int complete, factor;
+        nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
+        npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
+        level = BMAPPGTOLEV(npages);
+        /* At each level, accumulate the number of dmap pages covered by
+         * the number of full child levels below it;
+         * repeat for the last incomplete child level.
+         */
+        ndmaps = 0;
+        npages--;               /* skip the first global control page */
+        /* skip higher level control pages above top level covered by map */
+        npages -= (2 - level);
+        npages--;               /* skip top level's control page */
+        for (i = level; i >= 0; i--) {
+                /* factor: pages in one complete child of a level i page
+                 * (a single dmap page at level 0).
+                 * NOTE(review): npages is truncated to u32 for the
+                 * division and modulo below - confirm it always fits.
+                 */
+                factor =
+                    (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
+                complete = (u32) npages / factor;
+                ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL
+                                      : ((i == 1) ? LPERCTL : 1));
+                /* pages in last/incomplete child */
+                npages = (u32) npages % factor;
+                /* skip incomplete child's level control page */
+                npages--;
+        }
+        /* convert the number of dmaps into the number of blocks
+         * which can be covered by the dmaps;
+         */
+        nblocks = ndmaps << L2BPERDMAP;
+        return (nblocks);
+}
+#ifdef  _JFS_DEBUG_DMAP
+/*
+ * NAME:        DBinitmap()
+ *
+ * FUNCTION:    debug aid (built only with _JFS_DEBUG_DMAP): allocate a
+ *              shadow bitmap and fill it with the working maps of the
+ *              dmap pages that cover the first 'size' blocks.
+ *
+ * PARAMETERS:
+ *      size    - number of blocks to cover
+ *      ipbmap  - block allocation map inode the dmap pages are read from
+ *      results - receives the pointer to the shadow bitmap
+ *
+ * RETURNS: NONE; a dmap page that cannot be read is reported through
+ *          jfs_error() and its part of the shadow map stays zeroed.
+ */
+static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results)
+{
+        struct metapage *mp;
+        struct dmap *dp;
+        u32 *dbmap, *dst;
+        s64 blk, lblkno;
+        int npages, pg, word;
+        /* one 4096-byte shadow page per 32768 blocks, rounded up */
+        npages = size / 32768;
+        if (size % 32768)
+                npages += 1;
+        dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap);
+        if (dbmap == NULL)
+                BUG();  /* Not robust since this is only unused debug code */
+        /* clear the shadow map, 1024 words (4096 bytes) at a time */
+        for (pg = 0; pg < npages; pg++)
+                bzero(dbmap + pg * 1024, 4096);
+        /* Need to initialize from disk map pages: each dmap contributes
+         * LPERDMAP words for BPERDMAP blocks
+         */
+        dst = dbmap;
+        for (blk = 0; blk < size; blk += BPERDMAP, dst += LPERDMAP) {
+                lblkno = BLKTODMAP(blk,
+                                   JFS_SBI(ipbmap->i_sb)->bmap->
+                                   db_l2nbperpage);
+                mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+                if (mp != NULL) {
+                        dp = (struct dmap *) mp->data;
+                        for (word = 0; word < LPERDMAP; word++)
+                                dst[word] = le32_to_cpu(dp->wmap[word]);
+                        release_metapage(mp);
+                } else {
+                        jfs_error(ipbmap->i_sb,
+                                  "DBinitmap: could not read disk map page");
+                }
+        }
+        *results = dbmap;
+}
+/*
+ * NAME:        DBAlloc()
+ *
+ * FUNCTION:    debug aid: mark blocks [blkno, blkno + nblocks) as
+ *              allocated in a shadow bitmap, asserting that every one
+ *              of them was free beforehand.
+ *
+ * PARAMETERS:
+ *      dbmap   - shadow bitmap: one bit per block, 32 blocks per word,
+ *                most significant bit first
+ *      mapsize - number of blocks described by dbmap
+ *      blkno   - first block of the range (must be > 0)
+ *      nblocks - number of blocks in the range
+ *
+ * RETURNS: NONE
+ */
+void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+        int nb, bitno;
+        u32 mask;
+        assert(blkno > 0 && blkno < mapsize);
+        assert(nblocks > 0 && nblocks <= mapsize);
+        assert(blkno + nblocks <= mapsize);
+        /* first shadow word that holds a bit of the range */
+        dbmap += (blkno / 32);
+        while (nblocks > 0) {
+                /* bit offset of blkno inside the current word */
+                bitno = blkno & (32 - 1);
+                /* bits of the range living in this word; min_t because
+                 * the operands are an s64 and an int
+                 */
+                nb = min_t(s64, nblocks, 32 - bitno);
+                /* nb one-bits starting bitno bits below the top bit */
+                mask = (0xffffffff << (32 - nb) >> bitno);
+                assert((mask & *dbmap) == 0);
+                *dbmap |= mask;
+                dbmap++;
+                blkno += nb;
+                nblocks -= nb;
+        }
+}
+/*
+ * NAME:        DBFree()
+ *
+ * FUNCTION:    debug aid: mark blocks [blkno, blkno + nblocks) as free
+ *              in a shadow bitmap, asserting that every one of them was
+ *              allocated beforehand.
+ *
+ * PARAMETERS:
+ *      dbmap   - shadow bitmap: one bit per block, 32 blocks per word,
+ *                most significant bit first
+ *      mapsize - number of blocks described by dbmap
+ *      blkno   - first block of the range (must be > 0)
+ *      nblocks - number of blocks in the range
+ *
+ * RETURNS: NONE
+ */
+static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+        int nb, bitno;
+        u32 mask;
+        assert(blkno > 0 && blkno < mapsize);
+        assert(nblocks > 0 && nblocks <= mapsize);
+        assert(blkno + nblocks <= mapsize);
+        /* first shadow word that holds a bit of the range */
+        dbmap += (blkno / 32);
+        while (nblocks > 0) {
+                /* bit offset of blkno inside the current word */
+                bitno = blkno & (32 - 1);
+                /* bits of the range living in this word; min_t because
+                 * the operands are an s64 and an int
+                 */
+                nb = min_t(s64, nblocks, 32 - bitno);
+                /* nb one-bits starting bitno bits below the top bit */
+                mask = (0xffffffff << (32 - nb) >> bitno);
+                assert((mask & *dbmap) == mask);
+                *dbmap &= ~mask;
+                dbmap++;
+                blkno += nb;
+                nblocks -= nb;
+        }
+}
+/*
+ * NAME:        DBAllocCK()
+ *
+ * FUNCTION:    debug aid: assert that every block in
+ *              [blkno, blkno + nblocks) is marked allocated in a shadow
+ *              bitmap; the bitmap is not modified.
+ *
+ * PARAMETERS:
+ *      dbmap   - shadow bitmap: one bit per block, 32 blocks per word,
+ *                most significant bit first
+ *      mapsize - number of blocks described by dbmap
+ *      blkno   - first block of the range (must be > 0)
+ *      nblocks - number of blocks in the range
+ *
+ * RETURNS: NONE
+ */
+static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+        int nb, bitno;
+        u32 mask;
+        assert(blkno > 0 && blkno < mapsize);
+        assert(nblocks > 0 && nblocks <= mapsize);
+        assert(blkno + nblocks <= mapsize);
+        /* first shadow word that holds a bit of the range */
+        dbmap += (blkno / 32);
+        while (nblocks > 0) {
+                /* bit offset of blkno inside the current word */
+                bitno = blkno & (32 - 1);
+                /* bits of the range living in this word; min_t because
+                 * the operands are an s64 and an int
+                 */
+                nb = min_t(s64, nblocks, 32 - bitno);
+                /* nb one-bits starting bitno bits below the top bit */
+                mask = (0xffffffff << (32 - nb) >> bitno);
+                assert((mask & *dbmap) == mask);
+                dbmap++;
+                blkno += nb;
+                nblocks -= nb;
+        }
+}
+/*
+ * NAME:        DBFreeCK()
+ *
+ * FUNCTION:    debug aid: assert that every block in
+ *              [blkno, blkno + nblocks) is marked free in a shadow
+ *              bitmap; the bitmap is not modified.
+ *
+ * PARAMETERS:
+ *      dbmap   - shadow bitmap: one bit per block, 32 blocks per word,
+ *                most significant bit first
+ *      mapsize - number of blocks described by dbmap
+ *      blkno   - first block of the range (must be > 0)
+ *      nblocks - number of blocks in the range
+ *
+ * RETURNS: NONE
+ */
+static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
+{
+        int nb, bitno;
+        u32 mask;
+        assert(blkno > 0 && blkno < mapsize);
+        assert(nblocks > 0 && nblocks <= mapsize);
+        assert(blkno + nblocks <= mapsize);
+        /* first shadow word that holds a bit of the range */
+        dbmap += (blkno / 32);
+        while (nblocks > 0) {
+                /* bit offset of blkno inside the current word */
+                bitno = blkno & (32 - 1);
+                /* bits of the range living in this word; min_t because
+                 * the operands are an s64 and an int
+                 */
+                nb = min_t(s64, nblocks, 32 - bitno);
+                /* nb one-bits starting bitno bits below the top bit */
+                mask = (0xffffffff << (32 - nb) >> bitno);
+                assert((mask & *dbmap) == 0);
+                dbmap++;
+                blkno += nb;
+                nblocks -= nb;
+        }
+}
+/*
+ * NAME:        dbPrtMap()
+ *
+ * FUNCTION:    debug aid: print the fields of the in-memory bmap
+ *              descriptor with printk().
+ *
+ * PARAMETERS:
+ *      bmp     - bmap descriptor to print
+ *
+ * RETURNS: NONE
+ */
+static void dbPrtMap(struct bmap * bmp)
+{
+        /* db_mapsize, db_nfree and db_agsize are s64: print them with
+         * %lld through an explicit long long cast so the format matches
+         * the argument on every architecture.
+         */
+        printk("   mapsize:   %lld\n", (long long) bmp->db_mapsize);
+        printk("   nfree:     %lld\n", (long long) bmp->db_nfree);
+        printk("   numag:     %d\n", bmp->db_numag);
+        printk("   agsize:    %lld\n", (long long) bmp->db_agsize);
+        printk("   agl2size:  %d\n", bmp->db_agl2size);
+        printk("   agwidth:   %d\n", bmp->db_agwidth);
+        printk("   agstart:   %d\n", bmp->db_agstart);
+        printk("   agheigth:  %d\n", bmp->db_agheigth);
+        printk("   aglevel:   %d\n", bmp->db_aglevel);
+        printk("   maxlevel:  %d\n", bmp->db_maxlevel);
+        printk("   maxag:     %d\n", bmp->db_maxag);
+        printk("   agpref:    %d\n", bmp->db_agpref);
+        printk("   l2nbppg:   %d\n", bmp->db_l2nbperpage);
+}
+/*
+ * NAME:        dbPrtCtl()
+ *
+ * FUNCTION:    debug aid: print the header of a dmapctl page, then its
+ *              internal tree nodes and its leaves, eight per line.
+ *
+ * PARAMETERS:
+ *      dcp     - dmapctl page to print
+ *
+ * RETURNS: NONE
+ */
+static void dbPrtCtl(struct dmapctl * dcp)
+{
+        int i, j, n;
+        printk("   height:    %08x\n", le32_to_cpu(dcp->height));
+        printk("   leafidx:   %08x\n", le32_to_cpu(dcp->leafidx));
+        printk("   budmin:    %08x\n", dcp->budmin);
+        printk("   nleafs:    %08x\n", le32_to_cpu(dcp->nleafs));
+        printk("   l2nleafs:  %08x\n", le32_to_cpu(dcp->l2nleafs));
+        /* nodes above the leaves: indices 0 .. CTLLEAFIND - 1.
+         * printk() is used throughout (there is no printf() in the
+         * kernel), and each node is printed as u8 so that a negative
+         * value such as NOFREE shows as two hex digits.
+         */
+        printk("\n Tree:\n");
+        for (i = 0; i < CTLLEAFIND; i += 8) {
+                n = min(8, CTLLEAFIND - i);
+                for (j = 0; j < n; j++)
+                        printk("  [%03x]: %02x", i + j,
+                               (u8) dcp->stree[i + j]);
+                printk("\n");
+        }
+        /* the LPERCTL leaves start at index CTLLEAFIND */
+        printk("\n Tree Leaves:\n");
+        for (i = 0; i < LPERCTL; i += 8) {
+                n = min(8, LPERCTL - i);
+                for (j = 0; j < n; j++)
+                        printk("  [%03x]: %02x",
+                               i + j,
+                               (u8) dcp->stree[i + j + CTLLEAFIND]);
+                printk("\n");
+        }
+}
+#endif                          /* _JFS_DEBUG_DMAP */
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
new file mode 100644
index 000000000000..32e25884e7e8
--- /dev/null
+++ b/fs/jfs/jfs_dmap.h
@@ -0,0 +1,314 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DMAP
+#define _H_JFS_DMAP
+#include "jfs_txnmgr.h"
+#define BMAPVERSION     1       /* version number */
+#define TREESIZE        (256+64+16+4+1) /* number of nodes (s8 entries) in a dmap tree */
+#define LEAFIND         (64+16+4+1)     /* index of 1st leaf of a dmap tree */
+#define LPERDMAP        256     /* num leaves per dmap tree */
+#define L2LPERDMAP      8       /* l2 number of leaves per dmap tree */
+#define DBWORD          32      /* # of blks covered by a map word */
+#define L2DBWORD        5       /* l2 # of blks covered by a map word */
+#define BUDMIN          L2DBWORD        /* l2 max free string in a map word; budmin of a dmap tree */
+#define BPERDMAP        (LPERDMAP * DBWORD)     /* num of blks per dmap */
+#define L2BPERDMAP      13      /* l2 num of blks per dmap */
+#define CTLTREESIZE     (1024+256+64+16+4+1)    /* number of nodes (s8 entries) in a dmapctl tree */
+#define CTLLEAFIND      (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */
+#define LPERCTL         1024    /* num of leaves per dmapctl tree */
+#define L2LPERCTL       10      /* l2 num of leaves per dmapctl tree */
+#define ROOT            0       /* index of the root of a tree */
+#define NOFREE          ((s8) -1)       /* no blocks free */
+#define MAXAG           128     /* max number of allocation groups */
+#define L2MAXAG         7       /* l2 max num of AG */
+#define L2MINAGSZ       25      /* l2 of minimum AG size in bytes */
+#define BMAPBLKNO       0       /* lblkno of bmap within the map */
+/*
+ * maximum l2 number of disk blocks at the various dmapctl levels:
+ * each dmapctl level multiplies the coverage of the level below by
+ * LPERCTL, i.e. adds L2LPERCTL to its log2.
+ */
+#define L2MAXL0SIZE     (L2BPERDMAP + 1 * L2LPERCTL)
+#define L2MAXL1SIZE     (L2BPERDMAP + 2 * L2LPERCTL)
+#define L2MAXL2SIZE     (L2BPERDMAP + 3 * L2LPERCTL)
+/*
+ * maximum number of disk blocks at the various dmapctl levels.
+ */
+#define MAXL0SIZE       ((s64)1 << L2MAXL0SIZE)
+#define MAXL1SIZE       ((s64)1 << L2MAXL1SIZE)
+#define MAXL2SIZE       ((s64)1 << L2MAXL2SIZE)
+#define MAXMAPSIZE      MAXL2SIZE       /* maximum aggregate map size */
+/*
+ * return the largest of the four consecutive tree nodes starting at cp,
+ * i.e. the maximum free string among four sibling (lower level) nodes
+ * of the tree.
+ */
+static __inline signed char TREEMAX(signed char *cp)
+{
+        signed char best = cp[0];
+        int k;
+        for (k = 1; k < 4; k++) {
+                if (cp[k] > best)
+                        best = cp[k];
+        }
+        return best;
+}
+/*
+ * convert disk block number to the logical block number of the dmap
+ * describing the disk block.  s is the log2(number of logical blocks per page)
+ *
+ * The calculation figures out how many logical pages are in front of the dmap.
+ *      - the number of dmaps preceding it
+ *      - the number of L0 pages preceding its L0 page
+ *      - the number of L1 pages preceding its L1 page
+ *      - 3 is added to account for the L2, L1, and L0 page for this dmap
+ *      - 1 is added to account for the control page of the map.
+ *
+ * the shift counts 13, 23 and 33 equal L2BPERDMAP, L2MAXL0SIZE and
+ * L2MAXL1SIZE: log2 of the blocks covered by a dmap, an L0 page and an
+ * L1 page.
+ */
+#define BLKTODMAP(b,s)    \
+        ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
+/*
+ * convert disk block number to the logical block number of the LEVEL 0
+ * dmapctl describing the disk block.  s is the log2(number of logical blocks
+ * per page)
+ *
+ * The calculation figures out how many logical pages are in front of the L0.
+ *      - the number of dmap pages preceding it
+ *      - the number of L0 pages preceding it
+ *      - the number of L1 pages preceding its L1 page
+ *      - 2 is added to account for the L2, and L1 page for this L0
+ *      - 1 is added to account for the control page of the map.
+ */
+#define BLKTOL0(b,s)      \
+        (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
+/*
+ * convert disk block number to the logical block number of the LEVEL 1
+ * dmapctl describing the disk block.  s is the log2(number of logical blocks
+ * per page)
+ *
+ * The calculation figures out how many logical pages are in front of the L1.
+ *      - the number of dmap pages preceding it
+ *      - the number of L0 pages preceding it
+ *      - the number of L1 pages preceding it
+ *      - 1 is added to account for the L2 page
+ *      - 1 is added to account for the control page of the map.
+ */
+#define BLKTOL1(b,s)      \
+     (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s))
+/*
+ * convert disk block number to the logical block number of the dmapctl
+ * at the specified level which describes the disk block.
+ *
+ * NOTE(review): for level 2 the result is the constant 1 and is not
+ * shifted by s, unlike BLKTOL0() and BLKTOL1() - confirm this is right
+ * when s != 0.
+ */
+#define BLKTOCTL(b,s,l)   \
+        (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
+/*
+ * convert aggregate map size to the zero origin dmapctl level of the
+ * top dmapctl.
+ */
+#define BMAPSZTOLEV(size)       \
+        (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2)
+/* convert disk block number to allocation group number
+ * (sbi is the JFS superblock info that holds the bmap).
+ */
+#define BLKTOAG(b,sbi)  ((b) >> ((sbi)->bmap->db_agl2size))
+/* convert allocation group number to starting disk block
+ * number (ip is any inode of the file system; the bmap is reached
+ * through its superblock).
+ */
+#define AGTOBLK(a,ip)   \
+        ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size))
+/*
+ *      dmap summary tree
+ *
+ * dmaptree must be consistent with dmapctl: the fields nleafs through
+ * stree come in the same order in both structures, which is what allows
+ * a struct dmapctl to be handed to code taking a struct dmaptree.
+ * The multi-byte fields are stored little-endian (__le32).
+ */
+struct dmaptree {
+        __le32 nleafs;          /* 4: number of tree leafs      */
+        __le32 l2nleafs;        /* 4: l2 number of tree leafs   */
+        __le32 leafidx;         /* 4: index of first tree leaf  */
+        __le32 height;          /* 4: height of the tree        */
+        s8 budmin;              /* 1: min l2 tree leaf value to combine */
+        s8 stree[TREESIZE];     /* TREESIZE: tree nodes, root at index 0 */
+        u8 pad[2];              /* 2: pad to word boundary      */
+};                              /* - 360 -                      */
+/*
+ *      dmap page per 8K blocks bitmap
+ *
+ * one dmap describes BPERDMAP (8192) blocks: a summary tree followed,
+ * in the second half of the page, by the working and the persistent
+ * bitmap of LPERDMAP 32-bit words each.
+ */
+struct dmap {
+        __le32 nblocks;         /* 4: num blks covered by this dmap     */
+        __le32 nfree;           /* 4: num of free blks in this dmap     */
+        __le64 start;           /* 8: starting blkno for this dmap      */
+        struct dmaptree tree;   /* 360: dmap tree                       */
+        u8 pad[1672];           /* 1672: pad to 2048 bytes              */
+        __le32 wmap[LPERDMAP];  /* 1024: bits of the working map        */
+        __le32 pmap[LPERDMAP];  /* 1024: bits of the persistent map     */
+};                              /* - 4096 -                             */
+/*
+ *      disk map control page per level.
+ *
+ * dmapctl must be consistent with dmaptree: the fields nleafs through
+ * stree come in the same order in both structures; only the size of
+ * stree (CTLTREESIZE instead of TREESIZE) and the padding differ.
+ */
+struct dmapctl {
+        __le32 nleafs;          /* 4: number of tree leafs      */
+        __le32 l2nleafs;        /* 4: l2 number of tree leafs   */
+        __le32 leafidx;         /* 4: index of the first tree leaf      */
+        __le32 height;          /* 4: height of tree            */
+        s8 budmin;              /* 1: minimum l2 tree leaf value        */
+        s8 stree[CTLTREESIZE];  /* CTLTREESIZE: dmapctl tree nodes, root at index 0 */
+        u8 pad[2714];           /* 2714: pad to 4096            */
+};                              /* - 4096 -                     */
+/*
+ *      common definition for dmaptree within dmap and dmapctl:
+ *      a union of both layouts, so that one pointer type can refer to
+ *      either kind of tree.
+ */
+typedef union dmtree {
+        struct dmaptree t1;
+        struct dmapctl t2;
+} dmtree_t;
+/* macros for accessing fields within dmtree; they all go through the
+ * dmaptree member (t1), relying on the two layouts sharing these fields
+ */
+#define dmt_nleafs      t1.nleafs
+#define dmt_l2nleafs    t1.l2nleafs
+#define dmt_leafidx     t1.leafidx
+#define dmt_height      t1.height
+#define dmt_budmin      t1.budmin
+#define dmt_stree       t1.stree
+/*
+ *      on-disk aggregate disk allocation map descriptor.
+ *
+ *      all multi-byte fields are little-endian (__le32/__le64); struct
+ *      dbmap carries the same fields in native types.
+ */
+struct dbmap_disk {
+        __le64 dn_mapsize;      /* 8: number of blocks in aggregate     */
+        __le64 dn_nfree;        /* 8: num free blks in aggregate map    */
+        __le32 dn_l2nbperpage;  /* 4: l2 number of blks per page        */
+        __le32 dn_numag;        /* 4: total number of ags               */
+        __le32 dn_maxlevel;     /* 4: number of active ags (NOTE(review): the name suggests the top dmapctl level - confirm) */
+        __le32 dn_maxag;        /* 4: max active alloc group number     */
+        __le32 dn_agpref;       /* 4: preferred alloc group (hint)      */
+        __le32 dn_aglevel;      /* 4: dmapctl level holding the AG      */
+        __le32 dn_agheigth;     /* 4: height in dmapctl of the AG       */
+        __le32 dn_agwidth;      /* 4: width in dmapctl of the AG        */
+        __le32 dn_agstart;      /* 4: start tree index at AG height     */
+        __le32 dn_agl2size;     /* 4: l2 num of blks per alloc group    */
+        __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count           */
+        __le64 dn_agsize;       /* 8: num of blks per alloc group       */
+        s8 dn_maxfreebud;       /* 1: max free buddy system             */
+        u8 pad[3007];           /* 3007: pad to 4096                    */
+};                              /* - 4096 -                             */
+struct dbmap {
+        s64 dn_mapsize;         /* number of blocks in aggregate     */
+        s64 dn_nfree;           /* num free blks in aggregate map    */
+        int dn_l2nbperpage;     /* l2 number of blks per page        */
+        int dn_numag;           /* total number of ags               */
+        int dn_maxlevel;        /* number of active ags (NOTE(review): the name suggests the top dmapctl level - confirm) */
+        int dn_maxag;           /* max active alloc group number     */
+        int dn_agpref;          /* preferred alloc group (hint)      */
+        int dn_aglevel;         /* dmapctl level holding the AG      */
+        int dn_agheigth;        /* height in dmapctl of the AG       */
+        int dn_agwidth;         /* width in dmapctl of the AG        */
+        int dn_agstart;         /* start tree index at AG height     */
+        int dn_agl2size;        /* l2 num of blks per alloc group    */
+        s64 dn_agfree[MAXAG];   /* per AG free count           */
+        s64 dn_agsize;          /* num of blks per alloc group       */
+        signed char dn_maxfreebud;      /* max free buddy system             */
+};                              /* native-typed counterpart of struct dbmap_disk; has no pad member, so it is not 4096 bytes */
+/*
+ *      in-memory aggregate disk allocation map descriptor.
+ *
+ *      wraps the native-typed descriptor (struct dbmap) together with
+ *      the run-time state kept per mounted aggregate; the db_* macros
+ *      that follow give direct access to the embedded dn_* fields.
+ */
+struct bmap {
+        struct dbmap db_bmap;           /* in-memory copy of the aggregate map descriptor */
+        struct inode *db_ipbmap;        /* ptr to aggregate map incore inode */
+        struct semaphore db_bmaplock;   /* aggregate map lock */
+        atomic_t db_active[MAXAG];      /* count of active, open files in AG */
+        u32 *db_DBmap;
+};
+/* macros for accessing fields within in-memory aggregate map descriptor:
+ * each db_xxx expands to the dn_xxx member of the embedded db_bmap
+ */
+#define db_mapsize      db_bmap.dn_mapsize
+#define db_nfree        db_bmap.dn_nfree
+#define db_agfree       db_bmap.dn_agfree
+#define db_agsize       db_bmap.dn_agsize
+#define db_agl2size     db_bmap.dn_agl2size
+#define db_agwidth      db_bmap.dn_agwidth
+#define db_agheigth     db_bmap.dn_agheigth
+#define db_agstart      db_bmap.dn_agstart
+#define db_numag        db_bmap.dn_numag
+#define db_maxlevel     db_bmap.dn_maxlevel
+#define db_aglevel      db_bmap.dn_aglevel
+#define db_agpref       db_bmap.dn_agpref
+#define db_maxag        db_bmap.dn_maxag
+#define db_maxfreebud   db_bmap.dn_maxfreebud
+#define db_l2nbperpage  db_bmap.dn_l2nbperpage
+/*
+ * macros for various conversions needed by the allocators.
+ * blkstol2(), cntlz(), and cnttz() are operating system dependent functions
+ * (they are not defined in this header).
+ */
+/* convert number of blocks to log2 number of blocks, rounding up to
+ * the next log2 value if blocks is not a l2 multiple.
+ */
+#define BLKSTOL2(d)             (blkstol2(d))
+/* convert number of leafs to log2 leaf value: log2 of the leaf count
+ * (31 - leading zeros) plus BUDMIN
+ */
+#define NLSTOL2BSZ(n)           (31 - cntlz((n)) + BUDMIN)
+/* convert leaf index to log2 leaf value: trailing zeros of index n
+ * (or m when n is 0) plus b
+ */
+#define LITOL2BSZ(n,m,b)        ((((n) == 0) ? (m) : cnttz((n))) + (b))
+/* convert a block number to a dmap control leaf index: m is log2 of
+ * the blocks covered by one leaf; the result is in 0 .. LPERCTL - 1
+ */
+#define BLKTOCTLLEAF(b,m)       \
+        (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m))
+/* convert log2 leaf value to buddy size: 2 to the power (s - m) */
+#define BUDSIZE(s,m)            (1 << ((s) - (m)))
+/*
+ *      external references: entry points of the block allocation map
+ *      code that other parts of JFS call; each takes the inode of the
+ *      allocation map (or, for dbAllocBottomUp(), an inode) as its
+ *      first argument.
+ */
+extern int dbMount(struct inode *ipbmap);
+extern int dbUnmount(struct inode *ipbmap, int mounterror);
+extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks);
+extern int dbUpdatePMap(struct inode *ipbmap,
+                        int free, s64 blkno, s64 nblocks, struct tblock * tblk);
+extern int dbNextAG(struct inode *ipbmap);
+extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results);
+extern int dbReAlloc(struct inode *ipbmap,
+                     s64 blkno, s64 nblocks, s64 addnblocks, s64 * results);
+extern int dbSync(struct inode *ipbmap);
+extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks);
+extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks);
+extern void dbFinalizeBmap(struct inode *ipbmap);
+extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
+#endif                          /* _H_JFS_DMAP */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
new file mode 100644
index 000000000000..e357890adfb2
--- /dev/null
+++ b/fs/jfs/jfs_dtree.c
@@ -0,0 +1,4752 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *      jfs_dtree.c: directory B+-tree manager
+ *
+ * B+-tree with variable length key directory:
+ *
+ * each directory page is structured as an array of 32-byte
+ * directory entry slots initialized as a freelist
+ * to avoid search/compaction of free space at insertion.
+ * when an entry is inserted, a number of slots are allocated
+ * from the freelist as required to store variable length data
+ * of the entry; when the entry is deleted, slots of the entry
+ * are returned to freelist.
+ *
+ * leaf entry stores full name as key and file serial number
+ * (aka inode number) as data.
+ * internal/router entry stores suffix-compressed name
+ * as key and simple extent descriptor as data.
+ *
+ * each directory page maintains a sorted entry index table
+ * which stores the start slot index of sorted entries
+ * to allow binary search on the table.
+ *
+ * directory starts as a root/leaf page in on-disk inode
+ * inline data area.
+ * when it becomes full, it starts a leaf in an external extent
+ * with a length of 1 block. each time the first leaf becomes full,
+ * it is extended rather than split (its size is doubled),
+ * until its length becomes 4 KBytes; from then on the extent is split
+ * with a new 4 KByte extent when it becomes full,
+ * to reduce external fragmentation of small directories.
+ *
+ * blah, blah, blah, for linear scan of directory in pieces by
+ * readdir().
+ *
+ *
+ *      case-insensitive directory file system
+ *
+ * names are stored in case-sensitive way in leaf entry.
+ * but stored, searched and compared in case-insensitive (uppercase) order
+ * (i.e., both search key and entry key are folded for search/compare):
+ * (note that case-sensitive order is BROKEN in storage, e.g.,
+ *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
+ *
+ *  entries which fold to the same key make up an equivalence class
+ *  whose members are stored as a contiguous cluster (may cross page boundary)
+ *  but whose order is arbitrary and which act as duplicates, e.g.,
+ *  abc, Abc, aBc, abC)
+ *
+ * once match is found at leaf, requires scan forward/backward
+ * either for, in case-insensitive search, duplicate
+ * or for, in case-sensitive search, for exact match
+ *
+ * router entry must be created/stored in case-insensitive way
+ * in internal entry:
+ * (right most key of left page and left most key of right page
+ * are folded, and its suffix compression is propagated as router
+ * key in parent)
+ * (e.g., if split occurs between <abc> and <aBd>, <ABD> rather than <aB>
+ * should be made the router key for the split)
+ *
+ * case-insensitive search:
+ *
+ *      fold search key;
+ *
+ *      case-insensitive search of B-tree:
+ *      for internal entry, router key is already folded;
+ *      for leaf entry, fold the entry key before comparison.
+ *
+ *      if (leaf entry case-insensitive match found)
+ *              if (next entry satisfies case-insensitive match)
+ *                      return EDUPLICATE;
+ *              if (prev entry satisfies case-insensitive match)
+ *                      return EDUPLICATE;
+ *              return match;
+ *      else
+ *              return no match;
+ *
+ *      serialization:
+ * target directory inode lock is being held on entry/exit
+ * of all main directory service routines.
+ *
+ *      log based recovery:
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+/*
+ * dtree split parameter
+ *
+ * Bundles what the split routines need to insert the pending entry
+ * while splitting/extending a page; filled in by dtInsert() before it
+ * calls dtSplitUp().
+ */
+struct dtsplit {
+        struct metapage *mp;            /* metapage of the page to split */
+        s16 index;                      /* index at which to insert the entry */
+        s16 nslot;                      /* number of slots the entry needs */
+        struct component_name *key;     /* name of the entry being inserted */
+        ddata_t *data;                  /* data of the entry being inserted */
+        struct pxdlist *pxdlist;        /* extents allocated for the new page(s) */
+};
+/* dtree flavour of BT_PAGE: page type dtpage_t, in-inode root i_dtroot */
+#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
+/*
+ * get page buffer for specified block address
+ *
+ * Fetches the page through BT_GETPAGE and, when that succeeded (RC == 0),
+ * sanity-checks the page header:
+ *   - nextindex must not exceed DTROOTMAXSLOT when BN == 0, or the page's
+ *     own header.maxslot otherwise;
+ *   - for BN != 0, header.maxslot must not exceed DTPAGEMAXSLOT.
+ * On a failed check the page is released with BT_PUTPAGE, the problem is
+ * reported through jfs_error(), MP is set to NULL and RC to -EIO.
+ *
+ * MP and RC are assigned to, so they must be lvalues; BN and P are
+ * evaluated more than once.  The expansion is a bare brace block (not
+ * do { } while (0)), so take care when using it as the body of an
+ * if/else without braces.
+ */
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
+{\
+        BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
+        if (!(RC))\
+        {\
+                if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
+                    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
+                {\
+                        BT_PUTPAGE(MP);\
+                        jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
+                        MP = NULL;\
+                        RC = -EIO;\
+                }\
+        }\
+}
+/* for consistency: releasing a dtree page is just BT_PUTPAGE */
+#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
+/* dtree flavour of BT_GETSEARCH: page type dtpage_t, root i_dtroot */
+#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
+        BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
+/*
+ * forward references
+ *
+ * Prototypes for file-local (static) helpers that are used before
+ * their definitions appear further down in this file.
+ */
+/* page split / extend helpers driven by a struct dtsplit */
+static int dtSplitUp(tid_t tid, struct inode *ip,
+                     struct dtsplit * split, struct btstack * btstack);
+static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
+                       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
+static int dtExtendPage(tid_t tid, struct inode *ip,
+                        struct dtsplit * split, struct btstack * btstack);
+static int dtSplitRoot(tid_t tid, struct inode *ip,
+                       struct dtsplit * split, struct metapage ** rmpp);
+static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
+                      dtpage_t * fp, struct btstack * btstack);
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
+static int dtReadFirst(struct inode *ip, struct btstack * btstack);
+static int dtReadNext(struct inode *ip,
+                      loff_t * offset, struct btstack * btstack);
+/* key comparison: key vs. the entry stored at slot si of page p */
+static int dtCompare(struct component_name * key, dtpage_t * p, int si);
+static int ciCompare(struct component_name * key, dtpage_t * p, int si,
+                     int flag);
+static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
+                     int flag);
+static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+                              int ri, struct component_name * key, int flag);
+/* in-page entry manipulation; most take the dt_lock used for line locking */
+static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
+                          ddata_t * data, struct dt_lock **);
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+                        struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
+                        int do_index);
+static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
+static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
+static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
+/* apply UniStrupr() to the name buffer of component_name pointer c */
+#define ciToUpper(c)    UniStrupr((c)->name)
+/*
+ *      read_index_page()
+ *
+ *      Reads one page of a directory's index table.
+ *
+ *      The logical block blkno is first translated with xtLookup(); the
+ *      resulting address is then read as an absolute metapage, which keeps
+ *      this metadata out of the directory inode's own address space.
+ *
+ * return: the metapage, or NULL if the lookup failed, found no extent,
+ *         or read_metapage() returned NULL
+ */
+static struct metapage *read_index_page(struct inode *inode, s64 blkno)
+{
+        s64 extent_addr;
+        s32 extent_len;
+        int extent_flag;
+        if (xtLookup(inode, blkno, 1, &extent_flag, &extent_addr,
+                     &extent_len, 1))
+                return NULL;
+        /* lookup succeeded but nothing is mapped at blkno */
+        if (!extent_len)
+                return NULL;
+        return read_metapage(inode, extent_addr, PSIZE, 1);
+}
+/*
+ *      get_index_page()
+ *
+ *      Counterpart of read_index_page() for a page that is about to be
+ *      filled in: the logical block is translated with xtLookup() in the
+ *      same way, but the page is obtained with get_metapage() instead of
+ *      read_metapage().
+ *
+ * return: the metapage, or NULL if the lookup failed, found no extent,
+ *         or get_metapage() returned NULL
+ */
+static struct metapage *get_index_page(struct inode *inode, s64 blkno)
+{
+        s64 extent_addr;
+        s32 extent_len;
+        int extent_flag;
+        if (xtLookup(inode, blkno, 1, &extent_flag, &extent_addr,
+                     &extent_len, 1))
+                return NULL;
+        /* lookup succeeded but nothing is mapped at blkno */
+        if (!extent_len)
+                return NULL;
+        return get_metapage(inode, extent_addr, PSIZE, 1);
+}
+/*
+ *      find_index()
+ *
+ *      Returns a pointer to the directory table slot for the specified
+ *      index, together with the metapage that holds it (if any).
+ *
+ * parameter:
+ *      ip      - directory inode
+ *      index   - directory table index; valid values are
+ *                2 <= index < next_index
+ *      mp      - in/out one-page cache.  On entry *mp is either NULL or a
+ *                page from a previous call covering block *lblock; if the
+ *                wanted slot lives in a different block that page is
+ *                released and the right one is read.  For an inline table
+ *                *mp is simply set to NULL.
+ *      lblock  - in/out block number that *mp currently covers
+ *
+ * return: pointer to the slot, or NULL if index is out of range or the
+ *         table page could not be read (out-of-range returns leave *mp
+ *         untouched)
+ *
+ *      mp must be released by caller.
+ */
+static struct dir_table_slot *find_index(struct inode *ip, u32 index,
+                                         struct metapage ** mp, s64 *lblock)
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        s64 blkno;
+        s64 offset;
+        int page_offset;
+        struct dir_table_slot *slot;
+        static int maxWarnings = 10;    /* rate limit for the warning below */
+        if (index < 2) {
+                if (maxWarnings) {
+                        jfs_warn("find_index called with index = %u", index);
+                        maxWarnings--;
+                }
+                return NULL;
+        }
+        if (index >= jfs_ip->next_index) {
+                jfs_warn("find_index called with index >= next_index");
+                return NULL;
+        }
+        if (jfs_dirtable_inline(ip)) {
+                /*
+                 * Inline directory table
+                 */
+                *mp = NULL;
+                slot = &jfs_ip->i_dirtable[index - 2];
+        } else {
+                /* byte offset of the slot; the first entry has index 2 */
+                offset = (index - 2) * sizeof(struct dir_table_slot);
+                page_offset = offset & (PSIZE - 1);
+                /*
+                 * NOTE(review): the "+ 1" looks redundant as long as
+                 * offset is a multiple of the slot size; it is kept so
+                 * the computation stays identical to add_index().
+                 */
+                blkno = ((offset + 1) >> L2PSIZE) <<
+                    JFS_SBI(ip->i_sb)->l2nbperpage;
+                /* cached page covers a different block: drop it */
+                if (*mp && (*lblock != blkno)) {
+                        release_metapage(*mp);
+                        *mp = NULL;
+                }
+                if (!*mp) {
+                        *lblock = blkno;
+                        *mp = read_index_page(ip, blkno);
+                }
+                if (!*mp) {
+                        jfs_err("find_index: error reading directory table");
+                        return NULL;
+                }
+                slot =
+                    (struct dir_table_slot *) ((char *) (*mp)->data +
+                                               page_offset);
+        }
+        return slot;
+}
+/*
+ *      lock_index()
+ *
+ *      Takes a tlckDATA transaction lock on the index table page mp and
+ *      records one linelock entry covering the table slot of 'index'.
+ */
+static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
+                              u32 index)
+{
+        struct tlock *tlck = txLock(tid, ip, mp, tlckDATA);
+        struct linelock *llck = (struct linelock *) tlck->lock;
+        struct lv *entry;
+        /* no free lv left in this linelock: obtain one from txLinelock() */
+        if (llck->index >= llck->maxcnt)
+                llck = txLinelock(llck);
+        /* claim the next free lv */
+        entry = &llck->lv[llck->index++];
+        /*
+         *      Linelock slot size is twice the size of directory table
+         *      slot size.  512 entries per page.
+         */
+        entry->offset = ((index - 2) & 511) >> 1;
+        entry->length = 1;
+}
+/*
+ *      add_index()
+ *
+ *      Adds an entry to the directory index table.  This is used to provide
+ *      each directory entry with a persistent index in which to resume
+ *      directory traversals
+ *
+ * parameter:
+ *      tid     - transaction id, passed on to the xtree and lock routines
+ *      ip      - directory inode that owns the index table
+ *      bn      - block address recorded in the new table slot
+ *      slot    - slot number recorded in the new table slot
+ *
+ * return: the index assigned to the new entry (always >= 2);
+ *         0 - failure, in which case next_index is rolled back
+ *
+ *      While the table is small it lives inline in the inode; when entry
+ *      MAX_INLINE_DIRTABLE_ENTRY + 1 is added it is moved to an external
+ *      page described by an xtree, and grows a page at a time after that.
+ */
+static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
+{
+        struct super_block *sb = ip->i_sb;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        u64 blkno;
+        struct dir_table_slot *dirtab_slot;
+        u32 index;
+        struct linelock *llck;
+        struct lv *lv;
+        struct metapage *mp;
+        s64 offset;
+        uint page_offset;
+        struct tlock *tlck;
+        s64 xaddr;
+        ASSERT(DO_INDEX(ip));
+        if (jfs_ip->next_index < 2) {
+                jfs_warn("add_index: next_index = %d.  Resetting!",
+                           jfs_ip->next_index);
+                jfs_ip->next_index = 2;
+        }
+        index = jfs_ip->next_index++;
+        if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
+                /*
+                 * i_size reflects size of index table, or 8 bytes per entry.
+                 */
+                ip->i_size = (loff_t) (index - 1) << 3;
+                /*
+                 * dir table fits inline within inode
+                 */
+                dirtab_slot = &jfs_ip->i_dirtable[index-2];
+                dirtab_slot->flag = DIR_INDEX_VALID;
+                dirtab_slot->slot = slot;
+                DTSaddress(dirtab_slot, bn);
+                set_cflag(COMMIT_Dirtable, ip);
+                return index;
+        }
+        if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
+                struct dir_table_slot temp_table[12];
+                /*
+                 * It's time to move the inline table to an external
+                 * page and begin to build the xtree
+                 */
+                if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage))
+                        goto clean_up;  /* Over quota */
+                if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
+                        /* No space: give back the quota charged above */
+                        DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+                        goto clean_up;
+                }
+                /*
+                 * Save the table, we're going to overwrite it with the
+                 * xtree root
+                 */
+                memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
+                /*
+                 * Initialize empty x-tree
+                 */
+                xtInitRoot(tid, ip);
+                /*
+                 * Allocate the first block & add it to the xtree
+                 */
+                if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
+                        /* This really shouldn't fail */
+                        jfs_warn("add_index: xtInsert failed!");
+                        memcpy(&jfs_ip->i_dirtable, temp_table,
+                               sizeof (temp_table));
+                        /*
+                         * The block never made it into the xtree, so
+                         * nothing else will release it or its quota.
+                         */
+                        dbFree(ip, xaddr, sbi->nbperpage);
+                        DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+                        goto clean_up;
+                }
+                ip->i_size = PSIZE;
+                if ((mp = get_index_page(ip, 0)) == NULL) {
+                        jfs_err("add_index: get_metapage failed!");
+                        /* undo the insert, then restore the inline table */
+                        xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+                        memcpy(&jfs_ip->i_dirtable, temp_table,
+                               sizeof (temp_table));
+                        goto clean_up;
+                }
+                tlck = txLock(tid, ip, mp, tlckDATA);
+                llck = (struct linelock *) & tlck->lock;
+                ASSERT(llck->index == 0);
+                lv = &llck->lv[0];
+                lv->offset = 0;
+                lv->length = 6; /* tlckDATA slot size is 16 bytes */
+                llck->index++;
+                memcpy(mp->data, temp_table, sizeof(temp_table));
+                mark_metapage_dirty(mp);
+                release_metapage(mp);
+                /*
+                 * Logging is now directed by xtree tlocks
+                 */
+                clear_cflag(COMMIT_Dirtable, ip);
+        }
+        /* external table: locate the page and in-page offset of the slot */
+        offset = (index - 2) * sizeof(struct dir_table_slot);
+        page_offset = offset & (PSIZE - 1);
+        blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
+        if (page_offset == 0) {
+                /*
+                 * This will be the beginning of a new page
+                 */
+                xaddr = 0;
+                if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
+                        jfs_warn("add_index: xtInsert failed!");
+                        goto clean_up;
+                }
+                ip->i_size += PSIZE;
+                if ((mp = get_index_page(ip, blkno)))
+                        memset(mp->data, 0, PSIZE);     /* Just looks better */
+                else
+                        xtTruncate(tid, ip, offset, COMMIT_PWMAP);
+        } else
+                mp = read_index_page(ip, blkno);
+        if (mp == NULL) {
+                jfs_err("add_index: get/read_metapage failed!");
+                goto clean_up;
+        }
+        lock_index(tid, ip, mp, index);
+        dirtab_slot =
+            (struct dir_table_slot *) ((char *) mp->data + page_offset);
+        dirtab_slot->flag = DIR_INDEX_VALID;
+        dirtab_slot->slot = slot;
+        DTSaddress(dirtab_slot, bn);
+        mark_metapage_dirty(mp);
+        release_metapage(mp);
+        return index;
+      clean_up:
+        /* hand the index back; 0 tells the caller no index was assigned */
+        jfs_ip->next_index--;
+        return 0;
+}
+/*
+ *      free_index()
+ *
+ *      Marks an entry of the directory index table as free.
+ *
+ *      The slot is flagged DIR_INDEX_FREE, its slot and addr1 fields are
+ *      cleared and 'next' is stored (little-endian) in addr2.  Nothing is
+ *      done if find_index() cannot locate the slot.
+ */
+static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
+{
+        struct metapage *page = NULL;
+        s64 page_blk;
+        struct dir_table_slot *entry;
+        entry = find_index(ip, index, &page, &page_blk);
+        if (!entry)
+                return;
+        entry->addr1 = 0;
+        entry->slot = 0;
+        entry->flag = DIR_INDEX_FREE;
+        entry->addr2 = cpu_to_le32(next);
+        if (!page) {
+                /* inline table: flag it for commit */
+                set_cflag(COMMIT_Dirtable, ip);
+                return;
+        }
+        /* external table page: lock the slot, dirty and release the page */
+        lock_index(tid, ip, page, index);
+        mark_metapage_dirty(page);
+        release_metapage(page);
+}
+/*
+ *      modify_index()
+ *
+ *      Changes an entry in the directory index table: stores a new block
+ *      address (bn) and slot number in the table slot of 'index'.
+ *
+ *      mp/lblock are handed straight to find_index(), so a page left in
+ *      *mp can be reused by a later call; it is not released here.
+ *
+ *      NOTE(review): lblock is declared u64 * here while find_index()
+ *      takes s64 * - confirm the intended type.
+ */
+static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
+                         int slot, struct metapage ** mp, u64 *lblock)
+{
+        struct dir_table_slot *entry = find_index(ip, index, mp, lblock);
+        if (!entry)
+                return;
+        DTSaddress(entry, bn);
+        entry->slot = slot;
+        if (!*mp) {
+                /* inline table: flag it for commit */
+                set_cflag(COMMIT_Dirtable, ip);
+                return;
+        }
+        /* external table page: lock the slot and dirty the page */
+        lock_index(tid, ip, *mp, index);
+        mark_metapage_dirty(*mp);
+}
+/*
+ *      read_index()
+ *
+ *      Copies the directory table slot for 'index' into *dirtab_slot.
+ *
+ * return: 0 - slot copied;
+ *         -EIO - find_index() could not locate the slot
+ */
+static int read_index(struct inode *ip, u32 index,
+                     struct dir_table_slot * dirtab_slot)
+{
+        struct metapage *page = NULL;
+        s64 page_blk;
+        struct dir_table_slot *src;
+        src = find_index(ip, index, &page, &page_blk);
+        if (!src)
+                return -EIO;
+        memcpy(dirtab_slot, src, sizeof(*dirtab_slot));
+        /* only an external table hands back a page to release */
+        if (page)
+                release_metapage(page);
+        return 0;
+}
+/*
+ *      dtSearch()
+ *
+ * function:
+ *      Search for the entry with specified key
+ *
+ * parameter:
+ *      ip      - directory inode
+ *      key     - name to search for; it is copied (and upper-cased when
+ *                the JFS_OS2 mount flag is set), the caller's key is not
+ *                modified
+ *      data    - in/out inode number: for JFS_REMOVE/JFS_RENAME the value
+ *                passed in is compared with the entry found; on a hit it
+ *                receives the inode number of the entry; on a miss for
+ *                the flags that do not fail on a miss it is set to 0
+ *      btstack - traversal stack; reset here, parent frames are pushed
+ *                while descending and the result is stored in its top frame
+ *      flag    - JFS_LOOKUP, JFS_CREATE, JFS_REMOVE, JFS_RENAME or
+ *                JFS_FINDDIR
+ *
+ * return: 0 - for JFS_LOOKUP: entry found, *data set, page released;
+ *             for the other flags: search result on stack, leaf page
+ *             pinned;
+ *         -EEXIST - JFS_CREATE and the entry already exists;
+ *         -ENOENT - JFS_LOOKUP/JFS_REMOVE/JFS_RENAME and no such entry;
+ *         -ESTALE - JFS_REMOVE/JFS_RENAME and entry has another inumber;
+ *         -ENOMEM - key copy could not be allocated;
+ *         -EIO - page could not be read or stack overrun;
+ *      for every non-zero return no page is left pinned.
+ */
+int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
+             struct btstack * btstack, int flag)
+{
+        int rc = 0;
+        int cmp = 1;            /* init for empty page */
+        s64 bn;
+        struct metapage *mp;
+        dtpage_t *p;
+        s8 *stbl;
+        int base, index, lim;
+        struct btframe *btsp;
+        pxd_t *pxd;
+        int psize = 288;        /* initial in-line directory */
+        ino_t inumber;
+        struct component_name ciKey;
+        struct super_block *sb = ip->i_sb;
+        /* private copy of the key, so folding leaves the caller's key alone */
+        ciKey.name =
+            (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+                                GFP_NOFS);
+        if (ciKey.name == 0) {
+                rc = -ENOMEM;
+                goto dtSearch_Exit2;
+        }
+        /* uppercase search key for c-i directory */
+        UniStrcpy(ciKey.name, key->name);
+        ciKey.namlen = key->namlen;
+        /* only uppercase if case-insensitive support is on */
+        if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
+                ciToUpper(&ciKey);
+        }
+        BT_CLR(btstack);        /* reset stack */
+        /* init level count for max pages to split */
+        btstack->nsplit = 1;
+        /*
+         *      search down tree from root:
+         *
+         * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+         * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+         *
+         * if entry with search key K is not found
+         * internal page search find the entry with largest key Ki
+         * less than K which point to the child page to search;
+         * leaf page search find the entry with smallest key Kj
+         * greater than K so that the returned index is the position of
+         * the entry to be shifted right for insertion of new entry.
+         * for empty tree, search key is greater than any key of the tree.
+         *
+         * by convention, root bn = 0.
+         */
+        for (bn = 0;;) {
+                /* get/pin the page to search */
+                DT_GETPAGE(ip, bn, mp, psize, p, rc);
+                /* on failure DT_GETPAGE has already released the page */
+                if (rc)
+                        goto dtSearch_Exit1;
+                /* get sorted entry table of the page */
+                stbl = DT_GETSTBL(p);
+                /*
+                 * binary search with search key K on the current page.
+                 */
+                for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
+                        index = base + (lim >> 1);
+                        if (p->header.flag & BT_LEAF) {
+                                /* uppercase leaf name to compare */
+                                cmp =
+                                    ciCompare(&ciKey, p, stbl[index],
+                                              JFS_SBI(sb)->mntflag);
+                        } else {
+                                /* router key is in uppercase */
+                                cmp = dtCompare(&ciKey, p, stbl[index]);
+                        }
+                        if (cmp == 0) {
+                                /*
+                                 *      search hit
+                                 */
+                                /* search hit - leaf page:
+                                 * return the entry found
+                                 */
+                                if (p->header.flag & BT_LEAF) {
+                                        inumber = le32_to_cpu(
+                        ((struct ldtentry *) & p->slot[stbl[index]])->inumber);
+                                        /*
+                                         * search for JFS_LOOKUP
+                                         */
+                                        if (flag == JFS_LOOKUP) {
+                                                *data = inumber;
+                                                rc = 0;
+                                                goto out;
+                                        }
+                                        /*
+                                         * search for JFS_CREATE
+                                         */
+                                        if (flag == JFS_CREATE) {
+                                                *data = inumber;
+                                                rc = -EEXIST;
+                                                goto out;
+                                        }
+                                        /*
+                                         * search for JFS_REMOVE or JFS_RENAME
+                                         */
+                                        if ((flag == JFS_REMOVE ||
+                                             flag == JFS_RENAME) &&
+                                            *data != inumber) {
+                                                rc = -ESTALE;
+                                                goto out;
+                                        }
+                                        /*
+                                         * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
+                                         */
+                                        /* save search result */
+                                        *data = inumber;
+                                        btsp = btstack->top;
+                                        btsp->bn = bn;
+                                        btsp->index = index;
+                                        btsp->mp = mp;
+                                        rc = 0;
+                                        /* leaf stays pinned for the caller */
+                                        goto dtSearch_Exit1;
+                                }
+                                /* search hit - internal page:
+                                 * descend/search its child page
+                                 */
+                                goto getChild;
+                        }
+                        /* key is greater: continue in the upper half */
+                        if (cmp > 0) {
+                                base = index + 1;
+                                --lim;
+                        }
+                }
+                /*
+                 *      search miss
+                 *
+                 * base is the smallest index with key (Kj) greater than
+                 * search key (K) and may be zero or (maxindex + 1) index.
+                 */
+                /*
+                 * search miss - leaf page
+                 *
+                 * return location of entry (base) where new entry with
+                 * search key K is to be inserted.
+                 */
+                if (p->header.flag & BT_LEAF) {
+                        /*
+                         * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
+                         */
+                        if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
+                            flag == JFS_RENAME) {
+                                rc = -ENOENT;
+                                goto out;
+                        }
+                        /*
+                         * search for JFS_CREATE|JFS_FINDDIR:
+                         *
+                         * save search result
+                         */
+                        *data = 0;
+                        btsp = btstack->top;
+                        btsp->bn = bn;
+                        btsp->index = base;
+                        btsp->mp = mp;
+                        rc = 0;
+                        /* leaf stays pinned for the caller */
+                        goto dtSearch_Exit1;
+                }
+                /*
+                 * search miss - internal page
+                 *
+                 * if base is non-zero, decrement base by one to get the parent
+                 * entry of the child page to search.
+                 */
+                index = base ? base - 1 : base;
+                /*
+                 * go down to child page
+                 */
+              getChild:
+                /* update max. number of pages to split */
+                if (BT_STACK_FULL(btstack)) {
+                        /* Something's corrupted, mark filesystem dirty so
+                         * chkdsk will fix it.
+                         */
+                        jfs_error(sb, "stack overrun in dtSearch!");
+                        BT_STACK_DUMP(btstack);
+                        rc = -EIO;
+                        goto out;
+                }
+                btstack->nsplit++;
+                /* push (bn, index) of the parent page/entry */
+                BT_PUSH(btstack, bn, index);
+                /* get the child page block number */
+                pxd = (pxd_t *) & p->slot[stbl[index]];
+                bn = addressPXD(pxd);
+                psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+                /* unpin the parent page */
+                DT_PUTPAGE(mp);
+        }
+        /* exit that releases the current page */
+      out:
+        DT_PUTPAGE(mp);
+        /* exit that leaves page state as is; frees the key copy */
+      dtSearch_Exit1:
+        kfree(ciKey.name);
+      dtSearch_Exit2:
+        return rc;
+}
+/*
+ *      dtInsert()
+ *
+ * function: insert an entry to directory tree
+ *
+ * parameter:
+ *      tid     - transaction id used for the locks taken on the leaf page
+ *      ip      - directory inode
+ *      name    - name of the new entry
+ *      fsn     - pointer to the inode number stored with the entry
+ *      btstack - search result left by dtSearch(): its top frame names
+ *                the pinned leaf page and the index at which to insert
+ *
+ * return: 0 - success;
+ *         -EMLINK - indexed directory whose next_index reached DIREND;
+ *         otherwise the return value of dtSplitUp();
+ *      the leaf page is unpinned on every return path (by dtSplitUp()
+ *      when the page has to be split).
+ */
+int dtInsert(tid_t tid, struct inode *ip,
+         struct component_name * name, ino_t * fsn, struct btstack * btstack)
+{
+        int rc = 0;
+        struct metapage *mp;    /* meta-page buffer */
+        dtpage_t *p;            /* base B+-tree index page */
+        s64 bn;
+        int index;
+        struct dtsplit split;   /* split information */
+        ddata_t data;
+        struct dt_lock *dtlck;
+        int n;
+        struct tlock *tlck;
+        struct lv *lv;
+        /*
+         *      retrieve search result
+         *
+         * dtSearch() returns (leaf page pinned, index at which to insert).
+         * n.b. dtSearch() may return index of (maxindex + 1) of
+         * the full page.
+         */
+        DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
+        /*
+         *      insert entry for new key
+         */
+        if (DO_INDEX(ip)) {
+                /* index space exhausted: refuse the insert */
+                if (JFS_IP(ip)->next_index == DIREND) {
+                        DT_PUTPAGE(mp);
+                        return -EMLINK;
+                }
+                /* n = number of slots the new leaf entry needs */
+                n = NDTLEAF(name->namlen);
+                data.leaf.tid = tid;
+                data.leaf.ip = ip;
+        } else {
+                n = NDTLEAF_LEGACY(name->namlen);
+                data.leaf.ip = NULL;    /* signifies legacy directory format */
+        }
+        data.leaf.ino = *fsn;
+        /*
+         *      leaf page does not have enough room for new entry:
+         *
+         *      extend/split the leaf page;
+         *
+         * dtSplitUp() will insert the entry and unpin the leaf page.
+         */
+        if (n > p->header.freecnt) {
+                split.mp = mp;
+                split.index = index;
+                split.nslot = n;
+                split.key = name;
+                split.data = &data;
+                rc = dtSplitUp(tid, ip, &split, btstack);
+                return rc;
+        }
+        /*
+         *      leaf page does have enough room for new entry:
+         *
+         *      insert the new data entry into the leaf page;
+         */
+        BT_MARK_DIRTY(mp, ip);
+        /*
+         * acquire a transaction lock on the leaf page
+         */
+        tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        ASSERT(dtlck->index == 0);
+        lv = & dtlck->lv[0];
+        /* linelock header */
+        lv->offset = 0;
+        lv->length = 1;
+        dtlck->index++;
+        /* dtlck is passed by address, so the callee may replace it */
+        dtInsertEntry(p, index, name, &data, &dtlck);
+        /* linelock stbl of non-root leaf page */
+        if (!(p->header.flag & BT_ROOT)) {
+                if (dtlck->index >= dtlck->maxcnt)
+                        dtlck = (struct dt_lock *) txLinelock(dtlck);
+                lv = & dtlck->lv[dtlck->index];
+                /* lock from the stbl slot holding 'index' to the last used one */
+                n = index >> L2DTSLOTSIZE;
+                lv->offset = p->header.stblindex + n;
+                lv->length =
+                    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
+                dtlck->index++;
+        }
+        /* unpin the leaf page */
+        DT_PUTPAGE(mp);
+        return 0;
+}
+/*
+ *      dtSplitUp()
+ *
+ * function: propagate insertion bottom up;
+ *      The entry described by <split> did not fit in its page.  Depending
+ *      on the page, either
+ *       - the in-line root is split into a newly allocated child page,
+ *       - the first leaf extent (smaller than PSIZE) is extended, or
+ *       - the leaf page is split and a router entry for the new right
+ *         page is inserted into the parent, splitting parents in turn
+ *         while walking back the traverse stack.
+ *
+ * parameter:
+ *      tid     - transaction id handed to txLock() and the split routines;
+ *      ip      - directory inode;
+ *      split   - split descriptor; mp, nslot and data are read here,
+ *                pxdlist is set here, and mp/index/nslot/key are rewritten
+ *                when a parent page has to be split;
+ *      btstack - traverse stack; parent frames are popped with BT_POP();
+ *
+ * return: 0 - success;
+ *         errno - failure;
+ *              -ENOMEM if the router key buffer cannot be allocated,
+ *              -EDQUOT if the quota allocation for the extension fails,
+ *              otherwise the error returned by the failing callee;
+ *      leaf page unpinned;
+ */
+static int dtSplitUp(tid_t tid,
+          struct inode *ip, struct dtsplit * split, struct btstack * btstack)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        int rc = 0;
+        struct metapage *smp;
+        dtpage_t *sp;           /* split page */
+        struct metapage *rmp;
+        dtpage_t *rp;           /* new right page split from sp */
+        pxd_t rpxd;             /* new right page extent descriptor */
+        struct metapage *lmp;
+        dtpage_t *lp;           /* left child page */
+        int skip;               /* index of entry of insertion */
+        struct btframe *parent; /* parent page entry on traverse stack */
+        s64 xaddr, nxaddr;
+        int xlen, xsize;
+        struct pxdlist pxdlist;
+        pxd_t *pxd;
+        struct component_name key = { 0, NULL };
+        ddata_t *data = split->data;
+        int n;
+        struct dt_lock *dtlck;
+        struct tlock *tlck;
+        struct lv *lv;
+        int quota_allocation = 0;       /* blocks charged here; undone on error */
+        /* get split page */
+        smp = split->mp;
+        sp = DT_PAGE(ip, smp);
+        /* scratch buffer for router keys; released at freeKeyName */
+        key.name =
+            (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
+                                GFP_NOFS);
+        if (key.name == NULL) {
+                DT_PUTPAGE(smp);
+                rc = -ENOMEM;
+                /* nothing allocated yet: bypass the cleanup labels */
+                goto dtSplitUp_Exit;
+        }
+        /*
+         *      split leaf page
+         *
+         * The split routines insert the new entry, and
+         * acquire txLock as appropriate.
+         */
+        /*
+         *      split root leaf page:
+         */
+        if (sp->header.flag & BT_ROOT) {
+                /*
+                 * allocate a single extent child page
+                 *
+                 * one block is used unless the slots left in it after
+                 * the stbl and the current root contents do not exceed
+                 * the size of the new entry, in which case two blocks
+                 * are allocated.
+                 */
+                xlen = 1;
+                n = sbi->bsize >> L2DTSLOTSIZE;
+                n -= (n + 31) >> L2DTSLOTSIZE;  /* stbl size */
+                n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
+                if (n <= split->nslot)
+                        xlen++;
+                if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
+                        DT_PUTPAGE(smp);
+                        goto freeKeyName;
+                }
+                pxdlist.maxnpxd = 1;
+                pxdlist.npxd = 0;
+                pxd = &pxdlist.pxd[0];
+                PXDaddress(pxd, xaddr);
+                PXDlength(pxd, xlen);
+                split->pxdlist = &pxdlist;
+                rc = dtSplitRoot(tid, ip, split, &rmp);
+                /* on failure give the extent back, else unpin the new page */
+                if (rc)
+                        dbFree(ip, xaddr, xlen);
+                else
+                        DT_PUTPAGE(rmp);
+                DT_PUTPAGE(smp);
+                goto freeKeyName;
+        }
+        /*
+         *      extend first leaf page
+         *
+         * extend the 1st extent if less than buffer page size
+         * (dtExtendPage() returns leaf page unpinned)
+         */
+        pxd = &sp->header.self;
+        xlen = lengthPXD(pxd);
+        xsize = xlen << sbi->l2bsize;
+        if (xsize < PSIZE) {
+                xaddr = addressPXD(pxd);
+                n = xsize >> L2DTSLOTSIZE;
+                n -= (n + 31) >> L2DTSLOTSIZE;  /* stbl size */
+                /*
+                 * <n> becomes the number of blocks to add: three times
+                 * the current length if doubling would not make room
+                 * for the new entry, the current length otherwise.
+                 */
+                if ((n + sp->header.freecnt) <= split->nslot)
+                        n = xlen + (xlen << 1);
+                else
+                        n = xlen;
+                /* Allocate blocks to quota. */
+                if (DQUOT_ALLOC_BLOCK(ip, n)) {
+                        rc = -EDQUOT;
+                        goto extendOut;
+                }
+                quota_allocation += n;
+                if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
+                                    (s64) n, &nxaddr)))
+                        goto extendOut;
+                pxdlist.maxnpxd = 1;
+                pxdlist.npxd = 0;
+                pxd = &pxdlist.pxd[0];
+                /*
+                 * each macro call is terminated as its own statement,
+                 * like the other PXDaddress()/PXDlength() uses in this
+                 * function, instead of relying on the macro expansion.
+                 */
+                PXDaddress(pxd, nxaddr);
+                PXDlength(pxd, xlen + n);
+                split->pxdlist = &pxdlist;
+                if ((rc = dtExtendPage(tid, ip, split, btstack))) {
+                        nxaddr = addressPXD(pxd);
+                        if (xaddr != nxaddr) {
+                                /* free relocated extent */
+                                xlen = lengthPXD(pxd);
+                                dbFree(ip, nxaddr, (s64) xlen);
+                        } else {
+                                /* free extended delta */
+                                xlen = lengthPXD(pxd) - n;
+                                xaddr = addressPXD(pxd) + xlen;
+                                dbFree(ip, xaddr, (s64) n);
+                        }
+                }
+              extendOut:
+                DT_PUTPAGE(smp);
+                goto freeKeyName;
+        }
+        /*
+         *      split leaf page <sp> into <sp> and a new right page <rp>.
+         *
+         * return <rp> pinned and its extent descriptor <rpxd>
+         */
+        /*
+         * allocate new directory page extent and
+         * new index page(s) to cover page split(s)
+         *
+         * allocation hint: ?
+         */
+        n = btstack->nsplit;
+        pxdlist.maxnpxd = pxdlist.npxd = 0;
+        xlen = sbi->nbperpage;
+        for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
+                if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
+                        PXDaddress(pxd, xaddr);
+                        PXDlength(pxd, xlen);
+                        pxdlist.maxnpxd++;
+                        continue;
+                }
+                DT_PUTPAGE(smp);
+                /* undo allocation */
+                goto splitOut;
+        }
+        split->pxdlist = &pxdlist;
+        if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
+                DT_PUTPAGE(smp);
+                /* undo allocation */
+                goto splitOut;
+        }
+        /*
+         * propagate up the router entry for the leaf page just split
+         *
+         * insert a router entry for the new page into the parent page,
+         * propagate the insert/split up the tree by walking back the stack
+         * of (bn of parent page, index of child page entry in parent page)
+         * that were traversed during the search for the page that split.
+         *
+         * the propagation of insert/split up the tree stops if the root
+         * splits or the page inserted into doesn't have to split to hold
+         * the new entry.
+         *
+         * the parent entry for the split page remains the same, and
+         * a new entry is inserted at its right with the first key and
+         * block number of the new right page.
+         *
+         * There are a maximum of 4 pages pinned at any time:
+         * two children, left parent and right parent (when the parent splits).
+         * keep the child pages pinned while working on the parent.
+         * make sure that all pins are released at exit.
+         */
+        while ((parent = BT_POP(btstack)) != NULL) {
+                /* parent page specified by stack frame <parent> */
+                /* keep current child pages (<lp>, <rp>) pinned */
+                lmp = smp;
+                lp = sp;
+                /*
+                 * insert router entry in parent for new right child page <rp>
+                 */
+                /* get the parent page <sp> */
+                DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+                if (rc) {
+                        DT_PUTPAGE(lmp);
+                        DT_PUTPAGE(rmp);
+                        goto splitOut;
+                }
+                /*
+                 * The new key entry goes ONE AFTER the index of parent entry,
+                 * because the split was to the right.
+                 */
+                skip = parent->index + 1;
+                /*
+                 * compute the key for the router entry
+                 *
+                 * key suffix compression:
+                 * for internal pages that have leaf pages as children,
+                 * retain only what's needed to distinguish between
+                 * the new entry and the entry on the page to its left.
+                 * If the keys compare equal, retain the entire key.
+                 *
+                 * note that compression is performed only at computing
+                 * router key at the lowest internal level.
+                 * further compression of the key between pairs of higher
+                 * level internal pages loses too much information and
+                 * the search may fail.
+                 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
+                 * results in two adjacent parent entries (a)(xx).
+                 * if split occurs between these two entries, and
+                 * if compression is applied, the router key of parent entry
+                 * of right page (x) will divert search for x into right
+                 * subtree and miss x in the left subtree.)
+                 *
+                 * the entire key must be retained for the next-to-leftmost
+                 * internal key at any level of the tree, or search may fail
+                 * (e.g., ?)
+                 */
+                switch (rp->header.flag & BT_TYPE) {
+                case BT_LEAF:
+                        /*
+                         * compute the length of prefix for suffix compression
+                         * between last entry of left page and first entry
+                         * of right page
+                         */
+                        if ((sp->header.flag & BT_ROOT && skip > 1) ||
+                            sp->header.prev != 0 || skip > 1) {
+                                /* compute uppercase router prefix key */
+                                rc = ciGetLeafPrefixKey(lp,
+                                                        lp->header.nextindex-1,
+                                                        rp, 0, &key,
+                                                        sbi->mntflag);
+                                if (rc) {
+                                        DT_PUTPAGE(lmp);
+                                        DT_PUTPAGE(rmp);
+                                        DT_PUTPAGE(smp);
+                                        goto splitOut;
+                                }
+                        } else {
+                                /* next to leftmost entry of
+                                   lowest internal level */
+                                /* compute uppercase router key */
+                                dtGetKey(rp, 0, &key, sbi->mntflag);
+                                key.name[key.namlen] = 0;
+                                if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
+                                        ciToUpper(&key);
+                        }
+                        n = NDTINTERNAL(key.namlen);
+                        break;
+                case BT_INTERNAL:
+                        dtGetKey(rp, 0, &key, sbi->mntflag);
+                        n = NDTINTERNAL(key.namlen);
+                        break;
+                default:
+                        /*
+                         * NOTE(review): <n> and <key> are not set on this
+                         * path, yet execution continues below with them.
+                         */
+                        jfs_err("dtSplitUp(): UFO!");
+                        break;
+                }
+                /* unpin left child page */
+                DT_PUTPAGE(lmp);
+                /*
+                 * compute the data for the router entry
+                 */
+                data->xd = rpxd;        /* child page xd */
+                /*
+                 * parent page is full - split the parent page
+                 */
+                if (n > sp->header.freecnt) {
+                        /* init for parent page split */
+                        split->mp = smp;
+                        split->index = skip;    /* index at insert */
+                        split->nslot = n;
+                        split->key = &key;
+                        /* split->data = data; */
+                        /* unpin right child page */
+                        DT_PUTPAGE(rmp);
+                        /* The split routines insert the new entry,
+                         * acquire txLock as appropriate.
+                         * return <rp> pinned and its block number <rbn>.
+                         */
+                        rc = (sp->header.flag & BT_ROOT) ?
+                            dtSplitRoot(tid, ip, split, &rmp) :
+                            dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
+                        if (rc) {
+                                DT_PUTPAGE(smp);
+                                goto splitOut;
+                        }
+                        /* smp and rmp are pinned */
+                }
+                /*
+                 * parent page is not full - insert router entry in parent page
+                 */
+                else {
+                        BT_MARK_DIRTY(smp, ip);
+                        /*
+                         * acquire a transaction lock on the parent page
+                         */
+                        tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
+                        dtlck = (struct dt_lock *) & tlck->lock;
+                        ASSERT(dtlck->index == 0);
+                        lv = & dtlck->lv[0];
+                        /* linelock header */
+                        lv->offset = 0;
+                        lv->length = 1;
+                        dtlck->index++;
+                        /* linelock stbl of non-root parent page */
+                        if (!(sp->header.flag & BT_ROOT)) {
+                                lv++;
+                                n = skip >> L2DTSLOTSIZE;
+                                lv->offset = sp->header.stblindex + n;
+                                lv->length =
+                                    ((sp->header.nextindex -
+                                      1) >> L2DTSLOTSIZE) - n + 1;
+                                dtlck->index++;
+                        }
+                        dtInsertEntry(sp, skip, &key, data, &dtlck);
+                        /* exit propagate up */
+                        break;
+                }
+        }
+        /* unpin current split and its right page */
+        DT_PUTPAGE(smp);
+        DT_PUTPAGE(rmp);
+        /*
+         * free remaining extents allocated for split
+         *
+         * (entries [npxd, maxnpxd) of pxdlist have not been consumed by
+         * the split routines; this label is reached on success and on
+         * failure alike)
+         */
+      splitOut:
+        n = pxdlist.npxd;
+        pxd = &pxdlist.pxd[n];
+        for (; n < pxdlist.maxnpxd; n++, pxd++)
+                dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
+      freeKeyName:
+        kfree(key.name);
+        /* Rollback quota allocation */
+        if (rc && quota_allocation)
+                DQUOT_FREE_BLOCK(ip, quota_allocation);
+      dtSplitUp_Exit:
+        return rc;
+}
+/*
+ *      dtSplitPage()
+ *
+ * function: Split a non-root page of a btree.
+ *      The new right page is built on the next unused extent of
+ *      split->pxdlist and linked in as the right sibling of the split
+ *      page.  When the split page is the last page of its level and the
+ *      new entry would be appended at its end, no entries are moved and
+ *      the new entry becomes the first entry of the new page; otherwise
+ *      the entries are divided between the two pages and the new entry
+ *      is inserted into whichever page its index falls in.
+ *
+ * parameter:
+ *      tid   - transaction id handed to txLock() and modify_index();
+ *      ip    - inode owning the btree;
+ *      split - split descriptor (mp, index, nslot, key, data, pxdlist);
+ *      rmpp  - out: metapage of the new right page (set on success);
+ *      rpp   - out: the new right page;
+ *      rpxdp - out: extent descriptor of the new right page (set on success);
+ *
+ * return: 0 - success;
+ *         errno - failure;
+ *              -EIO if no metapage is obtained for the new page,
+ *              -EDQUOT if the quota allocation fails,
+ *              or the rc set by DT_GETPAGE() for the next sibling page;
+ *      return split and new page pinned;
+ */
+static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
+            struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
+{
+        int rc = 0;
+        struct metapage *smp;
+        dtpage_t *sp;
+        struct metapage *rmp;
+        dtpage_t *rp;           /* new right page allocated */
+        s64 rbn;                /* new right page block number */
+        struct metapage *mp;
+        dtpage_t *p;
+        s64 nextbn;
+        struct pxdlist *pxdlist;
+        pxd_t *pxd;
+        int skip, nextindex, half, left, nxt, off, si;
+        struct ldtentry *ldtentry;
+        struct idtentry *idtentry;
+        u8 *stbl;
+        struct dtslot *f;
+        int fsi, stblsize;
+        int n;
+        struct dt_lock *sdtlck, *rdtlck;
+        struct tlock *tlck;
+        struct dt_lock *dtlck;
+        struct lv *slv, *rlv, *lv;
+        /* get split page */
+        smp = split->mp;
+        sp = DT_PAGE(ip, smp);
+        /*
+         * allocate the new right page for the split
+         *
+         * the next extent of the pxdlist is consumed (npxd is advanced)
+         * before either of the failure checks below.
+         */
+        pxdlist = split->pxdlist;
+        pxd = &pxdlist->pxd[pxdlist->npxd];
+        pxdlist->npxd++;
+        rbn = addressPXD(pxd);
+        rmp = get_metapage(ip, rbn, PSIZE, 1);
+        if (rmp == NULL)
+                return -EIO;
+        /* Allocate blocks to quota. */
+        if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+                release_metapage(rmp);
+                return -EDQUOT;
+        }
+        jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
+        BT_MARK_DIRTY(rmp, ip);
+        /*
+         * acquire a transaction lock on the new right page
+         */
+        tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
+        rdtlck = (struct dt_lock *) & tlck->lock;
+        rp = (dtpage_t *) rmp->data;
+        *rpp = rp;      /* stored right away, unlike *rmpp and *rpxdp */
+        rp->header.self = *pxd;
+        BT_MARK_DIRTY(smp, ip);
+        /*
+         * acquire a transaction lock on the split page
+         *
+         * action:
+         */
+        tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
+        sdtlck = (struct dt_lock *) & tlck->lock;
+        /* linelock header of split page */
+        ASSERT(sdtlck->index == 0);
+        slv = & sdtlck->lv[0];
+        slv->offset = 0;
+        slv->length = 1;
+        sdtlck->index++;
+        /*
+         * initialize/update sibling pointers between sp and rp
+         */
+        nextbn = le64_to_cpu(sp->header.next);
+        rp->header.next = cpu_to_le64(nextbn);
+        rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
+        sp->header.next = cpu_to_le64(rbn);
+        /*
+         * initialize new right page
+         */
+        rp->header.flag = sp->header.flag;
+        /* compute sorted entry table at start of extent data area */
+        rp->header.nextindex = 0;
+        rp->header.stblindex = 1;
+        n = PSIZE >> L2DTSLOTSIZE;
+        rp->header.maxslot = n;
+        stblsize = (n + 31) >> L2DTSLOTSIZE;    /* in unit of slot */
+        /* init freelist */
+        fsi = rp->header.stblindex + stblsize;
+        rp->header.freelist = fsi;
+        rp->header.freecnt = rp->header.maxslot - fsi;
+        /*
+         *      sequential append at tail: append without split
+         *
+         * If splitting the last page on a level because of appending
+         * an entry to it (skip is maxentry), it's likely that the access is
+         * sequential. Adding an empty page on the side of the level is less
+         * work and can push the fill factor much higher than normal.
+         * If we're wrong it's no big deal, we'll just do the split the right
+         * way next time.
+         * (It may look like it's equally easy to do a similar hack for
+         * reverse sorted data, that is, split the tree left,
+         * but it's not. Be my guest.)
+         */
+        if (nextbn == 0 && split->index == sp->header.nextindex) {
+                /* linelock header + stbl (first slot) of new page */
+                rlv = & rdtlck->lv[rdtlck->index];
+                rlv->offset = 0;
+                rlv->length = 2;
+                rdtlck->index++;
+                /*
+                 * initialize freelist of new right page
+                 *
+                 * each free slot points to the following one; the last
+                 * slot ends the chain with -1.
+                 */
+                f = &rp->slot[fsi];
+                for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+                        f->next = fsi;
+                f->next = -1;
+                /* insert entry at the first entry of the new right page */
+                dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
+                goto out;
+        }
+        /*
+         *      non-sequential insert (at possibly middle page)
+         */
+        /*
+         * update prev pointer of previous right sibling page;
+         */
+        if (nextbn != 0) {
+                DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+                if (rc) {
+                        discard_metapage(rmp);
+                        return rc;
+                }
+                BT_MARK_DIRTY(mp, ip);
+                /*
+                 * acquire a transaction lock on the next page
+                 */
+                tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+                jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
+                        tlck, ip, mp);
+                dtlck = (struct dt_lock *) & tlck->lock;
+                /* linelock header of previous right sibling page */
+                lv = & dtlck->lv[dtlck->index];
+                lv->offset = 0;
+                lv->length = 1;
+                dtlck->index++;
+                p->header.prev = cpu_to_le64(rbn);
+                DT_PUTPAGE(mp);
+        }
+        /*
+         * split the data between the split and right pages.
+         */
+        skip = split->index;
+        half = (PSIZE >> L2DTSLOTSIZE) >> 1;    /* swag */
+        left = 0;
+        /*
+         *      compute fill factor for split pages
+         *
+         * <nxt> traces the next entry to move to rp
+         * <off> traces the next entry to stay in sp
+         *
+         * <left> accumulates the slots needed by the entries that stay
+         * (the new entry included when its index is reached); the scan
+         * stops once <left> reaches <half>.
+         */
+        stbl = (u8 *) & sp->slot[sp->header.stblindex];
+        nextindex = sp->header.nextindex;
+        for (nxt = off = 0; nxt < nextindex; ++off) {
+                if (off == skip)
+                        /* check for fill factor with new entry size */
+                        n = split->nslot;
+                else {
+                        si = stbl[nxt];
+                        switch (sp->header.flag & BT_TYPE) {
+                        case BT_LEAF:
+                                ldtentry = (struct ldtentry *) & sp->slot[si];
+                                if (DO_INDEX(ip))
+                                        n = NDTLEAF(ldtentry->namlen);
+                                else
+                                        n = NDTLEAF_LEGACY(ldtentry->
+                                                           namlen);
+                                break;
+                        case BT_INTERNAL:
+                                idtentry = (struct idtentry *) & sp->slot[si];
+                                n = NDTINTERNAL(idtentry->namlen);
+                                break;
+                        default:        /* NOTE(review): <n> is not assigned on this path */
+                                break;
+                        }
+                        ++nxt;  /* advance to next entry to move in sp */
+                }
+                left += n;
+                if (left >= half)
+                        break;
+        }
+        /* <nxt> points to the 1st entry to move */
+        /*
+         *      move entries to right page
+         *
+         * dtMoveEntry() initializes rp and reserves entry for insertion
+         *
+         * split page moved out entries are linelocked;
+         * new/right page moved in entries are linelocked;
+         */
+        /* linelock header + stbl of new right page */
+        rlv = & rdtlck->lv[rdtlck->index];
+        rlv->offset = 0;
+        rlv->length = 5;
+        rdtlck->index++;
+        dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
+        sp->header.nextindex = nxt;
+        /*
+         * finalize freelist of new right page
+         */
+        fsi = rp->header.freelist;
+        f = &rp->slot[fsi];
+        for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+                f->next = fsi;
+        f->next = -1;
+        /*
+         * Update directory index table for entries now in right page
+         */
+        if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
+                s64 lblock;
+                mp = NULL;
+                stbl = DT_GETSTBL(rp);
+                for (n = 0; n < rp->header.nextindex; n++) {
+                        ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
+                        modify_index(tid, ip, le32_to_cpu(ldtentry->index),
+                                     rbn, n, &mp, &lblock);
+                }
+                if (mp)
+                        release_metapage(mp);
+        }
+        /*
+         * the skipped index was on the left page,
+         */
+        if (skip <= off) {
+                /* insert the new entry in the split page */
+                dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
+                /* linelock stbl of split page */
+                if (sdtlck->index >= sdtlck->maxcnt)
+                        sdtlck = (struct dt_lock *) txLinelock(sdtlck);
+                slv = & sdtlck->lv[sdtlck->index];
+                n = skip >> L2DTSLOTSIZE;
+                slv->offset = sp->header.stblindex + n;
+                slv->length =
+                    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
+                sdtlck->index++;
+        }
+        /*
+         * the skipped index was on the right page,
+         */
+        else {
+                /* adjust the skip index to reflect the new position */
+                skip -= nxt;
+                /* insert the new entry in the right page */
+                dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
+        }
+      out:
+        *rmpp = rmp;
+        *rpxdp = *pxd;
+        return rc;
+}
+/*
+ *      dtExtendPage()
+ *
+ * function: extend 1st/only directory leaf page
+ *      The page takes over the extent found in split->pxdlist, which is
+ *      either its current extent grown in place or a relocated one.  The
+ *      sorted entry table is copied to the start of the added area, the
+ *      old table and the rest of the added area are put on the freelist,
+ *      the new entry is inserted, and the extent descriptor held in
+ *      slot 1 of the parent/root page is replaced with the new one.
+ *
+ * parameter:
+ *      tid     - transaction id handed to txLock(), txMaplock() and
+ *                modify_index();
+ *      ip      - directory inode;
+ *      split   - split descriptor (mp, index, key, data, pxdlist);
+ *      btstack - traverse stack; one frame (the parent) is popped;
+ *
+ * return: 0 - success;
+ *         errno - failure (rc set by DT_GETPAGE() for the parent page);
+ *      return extended page pinned;
+ */
+static int dtExtendPage(tid_t tid,
+             struct inode *ip, struct dtsplit * split, struct btstack * btstack)
+{
+        struct super_block *sb = ip->i_sb;
+        int rc;
+        struct metapage *smp, *pmp, *mp;
+        dtpage_t *sp, *pp;
+        struct pxdlist *pxdlist;
+        pxd_t *pxd, *tpxd;
+        int xlen, xsize;
+        int newstblindex, newstblsize;
+        int oldstblindex, oldstblsize;
+        int fsi, last;
+        struct dtslot *f;
+        struct btframe *parent;
+        int n;
+        struct dt_lock *dtlck;
+        s64 xaddr, txaddr;
+        struct tlock *tlck;
+        struct pxd_lock *pxdlock;
+        struct lv *lv;
+        uint type;
+        struct ldtentry *ldtentry;
+        u8 *stbl;
+        /* get page to extend */
+        smp = split->mp;
+        sp = DT_PAGE(ip, smp);
+        /* get parent/root page
+         *
+         * NOTE(review): the BT_POP() result is dereferenced without a
+         * NULL check, although dtSplitUp() tests it against NULL -
+         * confirm the stack always holds a parent frame here.
+         */
+        parent = BT_POP(btstack);
+        DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
+        if (rc)
+                return (rc);
+        /*
+         *      extend the extent
+         *
+         * take the next extent of the pxdlist and compare its address
+         * with that of the page's current extent to tell an in-place
+         * extension from a relocation.
+         */
+        pxdlist = split->pxdlist;
+        pxd = &pxdlist->pxd[pxdlist->npxd];
+        pxdlist->npxd++;
+        xaddr = addressPXD(pxd);
+        tpxd = &sp->header.self;
+        txaddr = addressPXD(tpxd);
+        /* in-place extension */
+        if (xaddr == txaddr) {
+                type = tlckEXTEND;
+        }
+        /* relocation */
+        else {
+                type = tlckNEW;
+                /* save moved extent descriptor for later free */
+                tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
+                pxdlock = (struct pxd_lock *) & tlck->lock;
+                pxdlock->flag = mlckFREEPXD;
+                pxdlock->pxd = sp->header.self;
+                pxdlock->index = 1;
+                /*
+                 * Update directory index table to reflect new page address
+                 */
+                if (DO_INDEX(ip)) {
+                        s64 lblock;
+                        mp = NULL;
+                        stbl = DT_GETSTBL(sp);
+                        for (n = 0; n < sp->header.nextindex; n++) {
+                                ldtentry =
+                                    (struct ldtentry *) & sp->slot[stbl[n]];
+                                modify_index(tid, ip,
+                                             le32_to_cpu(ldtentry->index),
+                                             xaddr, n, &mp, &lblock);
+                        }
+                        if (mp)
+                                release_metapage(mp);
+                }
+        }
+        /*
+         *      extend the page
+         */
+        sp->header.self = *pxd;
+        jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
+        BT_MARK_DIRTY(smp, ip);
+        /*
+         * acquire a transaction lock on the extended/leaf page
+         */
+        tlck = txLock(tid, ip, smp, tlckDTREE | type);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        lv = & dtlck->lv[0];
+        /* update buffer extent descriptor of extended page */
+        xlen = lengthPXD(pxd);
+        xsize = xlen << JFS_SBI(sb)->l2bsize;
+#ifdef _STILL_TO_PORT
+        bmSetXD(smp, xaddr, xsize);
+#endif                          /*  _STILL_TO_PORT */
+        /*
+         * copy old stbl to new stbl at start of extended area
+         *
+         * the new stbl begins at the old maxslot, i.e. at the first slot
+         * past the old area; nextindex bytes (one per entry) are copied.
+         */
+        oldstblindex = sp->header.stblindex;
+        oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
+        newstblindex = sp->header.maxslot;
+        n = xsize >> L2DTSLOTSIZE;
+        newstblsize = (n + 31) >> L2DTSLOTSIZE;
+        memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
+               sp->header.nextindex);
+        /*
+         * in-line extension: linelock old area of extended page
+         */
+        if (type == tlckEXTEND) {
+                /* linelock header */
+                lv->offset = 0;
+                lv->length = 1;
+                dtlck->index++;
+                lv++;
+                /* linelock new stbl of extended page */
+                lv->offset = newstblindex;
+                lv->length = newstblsize;
+        }
+        /*
+         * relocation: linelock whole relocated area
+         */
+        else {
+                lv->offset = 0;
+                lv->length = sp->header.maxslot + newstblsize;
+        }
+        dtlck->index++;
+        sp->header.maxslot = n;
+        sp->header.stblindex = newstblindex;
+        /* sp->header.nextindex remains the same */
+        /*
+         * add old stbl region at head of freelist
+         *
+         * each old stbl slot is pushed in front of the list built so
+         * far, so the last old stbl slot ends up as the new head.
+         */
+        fsi = oldstblindex;
+        f = &sp->slot[fsi];
+        last = sp->header.freelist;
+        for (n = 0; n < oldstblsize; n++, fsi++, f++) {
+                f->next = last;
+                last = fsi;
+        }
+        sp->header.freelist = last;
+        sp->header.freecnt += oldstblsize;
+        /*
+         * append free region of newly extended area at tail of freelist
+         */
+        /* init free region of newly extended area */
+        fsi = n = newstblindex + newstblsize;
+        f = &sp->slot[fsi];
+        for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
+                f->next = fsi;
+        f->next = -1;
+        /* append new free region at tail of old freelist */
+        fsi = sp->header.freelist;
+        if (fsi == -1)
+                sp->header.freelist = n;
+        else {
+                /* walk to the last slot of the existing chain */
+                do {
+                        f = &sp->slot[fsi];
+                        fsi = f->next;
+                } while (fsi != -1);
+                f->next = n;
+        }
+        sp->header.freecnt += sp->header.maxslot - n;
+        /*
+         * insert the new entry
+         */
+        dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
+        BT_MARK_DIRTY(pmp, ip);
+        /*
+         * linelock any freeslots residing in old extent
+         */
+        if (type == tlckEXTEND) {
+                n = sp->header.maxslot >> 2;    /* NOTE(review): a quarter of the new maxslot; presumably the old extent's slot count - confirm */
+                if (sp->header.freelist < n)
+                        dtLinelockFreelist(sp, n, &dtlck);
+        }
+        /*
+         *      update parent entry on the parent/root page
+         */
+        /*
+         * acquire a transaction lock on the parent/root page
+         */
+        tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        lv = & dtlck->lv[dtlck->index];
+        /* linelock parent entry - 1st slot */
+        lv->offset = 1;
+        lv->length = 1;
+        dtlck->index++;
+        /* update the parent pxd for page extension */
+        tpxd = (pxd_t *) & pp->slot[1];
+        *tpxd = *pxd;
+        DT_PUTPAGE(pmp);
+        return 0;
+}
+/*
+ *      dtSplitRoot()
+ *
+ * function:
+ *      split the full root page into
+ *      original/root/split page and new right page
+ *      i.e., root remains fixed in tree anchor (inode) and
+ *      the root is copied to a single new right child page
+ *      since root page << non-root page, and
+ *      the split root page contains a single entry for the
+ *      new right child page.
+ *
+ * parameter:
+ *      tid     - transaction id used for txLock() and modify_index();
+ *      ip      - directory inode; its in-line root is JFS_IP(ip)->i_dtroot;
+ *      split   - split descriptor: mp (root metapage), pxdlist (the next
+ *                unused pxd is consumed for the child page) and the
+ *                index/key/data of the entry to insert;
+ *      rmpp    - out: set to the metapage of the new child page;
+ *
+ * return: 0 - success;
+ *         -EIO - the metapage for the child page could not be obtained;
+ *         -EDQUOT - quota allocation for the child extent failed;
+ *      return new page pinned;
+ */
+static int dtSplitRoot(tid_t tid,
+            struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
+{
+        struct super_block *sb = ip->i_sb;
+        struct metapage *smp;
+        dtroot_t *sp;
+        struct metapage *rmp;
+        dtpage_t *rp;
+        s64 rbn;
+        int xlen;
+        int xsize;
+        struct dtslot *f;
+        s8 *stbl;
+        int fsi, stblsize, n;
+        struct idtentry *s;
+        pxd_t *ppxd;
+        struct pxdlist *pxdlist;
+        pxd_t *pxd;
+        struct dt_lock *dtlck;
+        struct tlock *tlck;
+        struct lv *lv;
+        /* get split root page */
+        smp = split->mp;
+        sp = &JFS_IP(ip)->i_dtroot;
+        /*
+         *      allocate/initialize a single (right) child page
+         *
+         * N.B. at first split, a one (or two) block to fit new entry
+         * is allocated; at subsequent split, a full page is allocated;
+         */
+        /* take the next unused extent from the caller's pxdlist */
+        pxdlist = split->pxdlist;
+        pxd = &pxdlist->pxd[pxdlist->npxd];
+        pxdlist->npxd++;
+        rbn = addressPXD(pxd);
+        xlen = lengthPXD(pxd);
+        /* extent length in bytes */
+        xsize = xlen << JFS_SBI(sb)->l2bsize;
+        rmp = get_metapage(ip, rbn, xsize, 1);
+        if (!rmp)
+                return -EIO;
+        rp = rmp->data;
+        /* Allocate blocks to quota. */
+        if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+                release_metapage(rmp);
+                return -EDQUOT;
+        }
+        BT_MARK_DIRTY(rmp, ip);
+        /*
+         * acquire a transaction lock on the new right page
+         */
+        tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        /* the child inherits the leaf/internal type of the old root */
+        rp->header.flag =
+            (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
+        rp->header.self = *pxd;
+        /* initialize sibling pointers */
+        rp->header.next = 0;
+        rp->header.prev = 0;
+        /*
+         *      move in-line root page into new right page extent
+         */
+        /* linelock header + copied entries + new stbl (1st slot) in new page */
+        ASSERT(dtlck->index == 0);
+        lv = & dtlck->lv[0];
+        lv->offset = 0;
+        lv->length = 10;        /* 1 + 8 + 1 */
+        dtlck->index++;
+        /* number of slots that fit in the child extent */
+        n = xsize >> L2DTSLOTSIZE;
+        rp->header.maxslot = n;
+        /* slots reserved for the sorted entry table of an n-slot page */
+        stblsize = (n + 31) >> L2DTSLOTSIZE;
+        /* copy old stbl to new stbl at start of extended area */
+        rp->header.stblindex = DTROOTMAXSLOT;
+        stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
+        memcpy(stbl, sp->header.stbl, sp->header.nextindex);
+        rp->header.nextindex = sp->header.nextindex;
+        /* copy old data area to start of new data area */
+        memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
+        /*
+         * append free region of newly extended area at tail of freelist
+         */
+        /* init free region of newly extended area */
+        fsi = n = DTROOTMAXSLOT + stblsize;
+        f = &rp->slot[fsi];
+        for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
+                f->next = fsi;
+        f->next = -1;
+        /*
+         * append new free region at tail of old freelist
+         *
+         * -1 terminates a well-formed list.  Any negative head or link is
+         * treated as the end of the list so that a corrupted value can
+         * never be used as a negative subscript into rp->slot[].
+         */
+        fsi = sp->header.freelist;
+        if (fsi < 0)
+                rp->header.freelist = n;
+        else {
+                rp->header.freelist = fsi;
+                do {
+                        f = &rp->slot[fsi];
+                        fsi = f->next;
+                } while (fsi >= 0);
+                f->next = n;
+        }
+        rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
+        /*
+         * Update directory index table for entries now in right page
+         */
+        if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
+                s64 lblock;
+                struct metapage *mp = NULL;
+                struct ldtentry *ldtentry;
+                stbl = DT_GETSTBL(rp);
+                /* every copied entry now lives at (rbn, n) */
+                for (n = 0; n < rp->header.nextindex; n++) {
+                        ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
+                        modify_index(tid, ip, le32_to_cpu(ldtentry->index),
+                                     rbn, n, &mp, &lblock);
+                }
+                if (mp)
+                        release_metapage(mp);
+        }
+        /*
+         * insert the new entry into the new right/child page
+         * (skip index in the new right page will not change)
+         */
+        dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
+        /*
+         *      reset parent/root page
+         *
+         * set the 1st entry offset to 0, which force the left-most key
+         * at any level of the tree to be less than any search key.
+         *
+         * The btree comparison code guarantees that the left-most key on any
+         * level of the tree is never used, so it doesn't need to be filled in.
+         */
+        BT_MARK_DIRTY(smp, ip);
+        /*
+         * acquire a transaction lock on the root page (in-memory inode)
+         */
+        tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        /* linelock root */
+        ASSERT(dtlck->index == 0);
+        lv = & dtlck->lv[0];
+        lv->offset = 0;
+        lv->length = DTROOTMAXSLOT;
+        dtlck->index++;
+        /* update page header of root */
+        if (sp->header.flag & BT_LEAF) {
+                sp->header.flag &= ~BT_LEAF;
+                sp->header.flag |= BT_INTERNAL;
+        }
+        /* init the first entry: a key-less router pointing at the child */
+        s = (struct idtentry *) & sp->slot[DTENTRYSTART];
+        ppxd = (pxd_t *) s;
+        *ppxd = *pxd;
+        s->next = -1;
+        s->namlen = 0;
+        stbl = sp->header.stbl;
+        stbl[0] = DTENTRYSTART;
+        sp->header.nextindex = 1;
+        /* init freelist */
+        fsi = DTENTRYSTART + 1;
+        f = &sp->slot[fsi];
+        /* init free region of remaining area */
+        for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
+                f->next = fsi;
+        f->next = -1;
+        sp->header.freelist = DTENTRYSTART + 1;
+        sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
+        /* hand the still-pinned child page back to the caller */
+        *rmpp = rmp;
+        return 0;
+}
+/*
+ *      dtDelete()
+ *
+ * function: delete the entry(s) referenced by a key.
+ *
+ * parameter:
+ *      tid     - transaction id under which the directory is modified;
+ *      ip      - directory inode;
+ *      key     - key identifying the entry to delete;
+ *      ino     - passed through to dtSearch() together with flag;
+ *      flag    - search flag handed to dtSearch();
+ *
+ * return: 0 - success;
+ *         errno - failure reported by dtSearch() or dtDeleteUp()
+ *                 (see the review note on the next-leaf read below);
+ *
+ * note: the leaf page pinned by dtSearch() is unpinned before return,
+ *      either here or inside dtDeleteUp().
+ */
+int dtDelete(tid_t tid,
+         struct inode *ip, struct component_name * key, ino_t * ino, int flag)
+{
+        int rc = 0;
+        s64 bn;
+        struct metapage *mp, *imp;
+        dtpage_t *p;
+        int index;
+        struct btstack btstack;
+        struct dt_lock *dtlck;
+        struct tlock *tlck;
+        struct lv *lv;
+        int i;
+        struct ldtentry *ldtentry;
+        u8 *stbl;
+        u32 table_index, next_index;
+        struct metapage *nmp;
+        dtpage_t *np;
+        /*
+         *      search for the entry to delete:
+         *
+         * dtSearch() returns (leaf page pinned, index at which to delete).
+         */
+        if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
+                return rc;
+        /* retrieve search result */
+        DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        /*
+         * We need to find the index of the next entry and put it into the
+         * directory index table in order to resume a readdir from this
+         * entry.
+         */
+        if (DO_INDEX(ip)) {
+                stbl = DT_GETSTBL(p);
+                ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
+                table_index = le32_to_cpu(ldtentry->index);
+                if (index == (p->header.nextindex - 1)) {
+                        /*
+                         * Last entry in this leaf page
+                         */
+                        if ((p->header.flag & BT_ROOT)
+                            || (p->header.next == 0))
+                                /* no following leaf: no next entry */
+                                next_index = -1;
+                        else {
+                                /* Read next leaf page */
+                                DT_GETPAGE(ip, le64_to_cpu(p->header.next),
+                                           nmp, PSIZE, np, rc);
+                                /*
+                                 * NOTE(review): a read failure falls back to
+                                 * next_index = -1 and the delete carries on,
+                                 * but rc keeps the error.  It is overwritten
+                                 * by dtDeleteUp() in the empty-page case and
+                                 * otherwise returned to the caller even
+                                 * though the entry was removed - confirm
+                                 * whether that is intended.
+                                 */
+                                if (rc)
+                                        next_index = -1;
+                                else {
+                                        /* first entry of the next leaf */
+                                        stbl = DT_GETSTBL(np);
+                                        ldtentry =
+                                            (struct ldtentry *) & np->
+                                            slot[stbl[0]];
+                                        next_index =
+                                            le32_to_cpu(ldtentry->index);
+                                        DT_PUTPAGE(nmp);
+                                }
+                        }
+                } else {
+                        /* next entry is in the same leaf page */
+                        ldtentry =
+                            (struct ldtentry *) & p->slot[stbl[index + 1]];
+                        next_index = le32_to_cpu(ldtentry->index);
+                }
+                free_index(tid, ip, table_index, next_index);
+        }
+        /*
+         * the leaf page becomes empty, delete the page
+         */
+        if (p->header.nextindex == 1) {
+                /* delete empty page */
+                rc = dtDeleteUp(tid, ip, mp, p, &btstack);
+        }
+        /*
+         * the leaf page has other entries remaining:
+         *
+         * delete the entry from the leaf page.
+         */
+        else {
+                BT_MARK_DIRTY(mp, ip);
+                /*
+                 * acquire a transaction lock on the leaf page
+                 */
+                tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+                dtlck = (struct dt_lock *) & tlck->lock;
+                /*
+                 * Do not assume that dtlck->index will be zero.  During a
+                 * rename within a directory, this transaction may have
+                 * modified this page already when adding the new entry.
+                 */
+                /* linelock header */
+                if (dtlck->index >= dtlck->maxcnt)
+                        dtlck = (struct dt_lock *) txLinelock(dtlck);
+                lv = & dtlck->lv[dtlck->index];
+                lv->offset = 0;
+                lv->length = 1;
+                dtlck->index++;
+                /* linelock stbl of non-root leaf page */
+                if (!(p->header.flag & BT_ROOT)) {
+                        if (dtlck->index >= dtlck->maxcnt)
+                                dtlck = (struct dt_lock *) txLinelock(dtlck);
+                        lv = & dtlck->lv[dtlck->index];
+                        /*
+                         * cover the stbl slots from the one holding the
+                         * deleted index through the one holding the last
+                         * index
+                         */
+                        i = index >> L2DTSLOTSIZE;
+                        lv->offset = p->header.stblindex + i;
+                        lv->length =
+                            ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
+                            i + 1;
+                        dtlck->index++;
+                }
+                /* free the leaf entry */
+                dtDeleteEntry(p, index, &dtlck);
+                /*
+                 * Update directory index table for entries moved in stbl
+                 */
+                if (DO_INDEX(ip) && index < p->header.nextindex) {
+                        s64 lblock;
+                        imp = NULL;
+                        stbl = DT_GETSTBL(p);
+                        /* record the new (bn, i) position of each entry */
+                        for (i = index; i < p->header.nextindex; i++) {
+                                ldtentry =
+                                    (struct ldtentry *) & p->slot[stbl[i]];
+                                modify_index(tid, ip,
+                                             le32_to_cpu(ldtentry->index),
+                                             bn, i, &imp, &lblock);
+                        }
+                        if (imp)
+                                release_metapage(imp);
+                }
+                DT_PUTPAGE(mp);
+        }
+        return rc;
+}
+/*
+ *      dtDeleteUp()
+ *
+ * function:
+ *      free empty pages as propagating deletion up the tree
+ *
+ * parameter:
+ *      tid     - transaction id for the locks taken on freed/updated pages;
+ *      ip      - directory inode;
+ *      fmp     - metapage of the leaf page that has become empty; it is
+ *                unpinned or discarded on every path through this function;
+ *      fp      - the leaf page held by fmp;
+ *      btstack - traversal stack; parent frames are popped from it;
+ *
+ * return: 0 - success;
+ *         errno - failure from dtRelink() or from reading a parent page;
+ */
+static int dtDeleteUp(tid_t tid, struct inode *ip,
+           struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
+{
+        int rc = 0;
+        struct metapage *mp;
+        dtpage_t *p;
+        int index, nextindex;
+        int xlen;
+        struct btframe *parent;
+        struct dt_lock *dtlck;
+        struct tlock *tlck;
+        struct lv *lv;
+        struct pxd_lock *pxdlock;
+        int i;
+        /*
+         *      keep the root leaf page which has become empty
+         */
+        if (BT_IS_ROOT(fmp)) {
+                /*
+                 * reset the root
+                 *
+                 * dtInitRoot() acquires txlock on the root
+                 */
+                dtInitRoot(tid, ip, PARENT(ip));
+                DT_PUTPAGE(fmp);
+                return 0;
+        }
+        /*
+         *      free the non-root leaf page
+         */
+        /*
+         * acquire a transaction lock on the page
+         *
+         * write FREEXTENT|NOREDOPAGE log record
+         * N.B. linelock is overlaid as freed extent descriptor, and
+         * the buffer page is freed;
+         */
+        tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
+        pxdlock = (struct pxd_lock *) & tlck->lock;
+        pxdlock->flag = mlckFREEPXD;
+        pxdlock->pxd = fp->header.self;
+        pxdlock->index = 1;
+        /* update sibling pointers */
+        if ((rc = dtRelink(tid, ip, fp))) {
+                BT_PUTPAGE(fmp);
+                return rc;
+        }
+        /* read the extent length before the page is discarded */
+        xlen = lengthPXD(&fp->header.self);
+        /* Free quota allocation. */
+        DQUOT_FREE_BLOCK(ip, xlen);
+        /* free/invalidate its buffer page */
+        discard_metapage(fmp);
+        /*
+         *      propagate page deletion up the directory tree
+         *
+         * If the delete from the parent page makes it empty,
+         * continue all the way up the tree.
+         * stop if the root page is reached (which is never deleted) or
+         * if the entry deletion does not empty the page.
+         */
+        while ((parent = BT_POP(btstack)) != NULL) {
+                /* pin the parent page */
+                DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * index of the router entry for the deleted child page
+                 */
+                index = parent->index;
+                /*
+                 * delete the entry for the child page from parent
+                 */
+                nextindex = p->header.nextindex;
+                /*
+                 * the parent has the single entry being deleted:
+                 *
+                 * free the parent page which has become empty.
+                 */
+                if (nextindex == 1) {
+                        /*
+                         * keep the root internal page which has become empty
+                         */
+                        if (p->header.flag & BT_ROOT) {
+                                /*
+                                 * reset the root
+                                 *
+                                 * dtInitRoot() acquires txlock on the root
+                                 */
+                                dtInitRoot(tid, ip, PARENT(ip));
+                                DT_PUTPAGE(mp);
+                                return 0;
+                        }
+                        /*
+                         * free the parent page
+                         */
+                        else {
+                                /*
+                                 * acquire a transaction lock on the page
+                                 *
+                                 * write FREEXTENT|NOREDOPAGE log record
+                                 */
+                                tlck =
+                                    txMaplock(tid, ip,
+                                              tlckDTREE | tlckFREE);
+                                pxdlock = (struct pxd_lock *) & tlck->lock;
+                                pxdlock->flag = mlckFREEPXD;
+                                pxdlock->pxd = p->header.self;
+                                pxdlock->index = 1;
+                                /* update sibling pointers */
+                                if ((rc = dtRelink(tid, ip, p))) {
+                                        DT_PUTPAGE(mp);
+                                        return rc;
+                                }
+                                /* read the length before discarding mp */
+                                xlen = lengthPXD(&p->header.self);
+                                /* Free quota allocation */
+                                DQUOT_FREE_BLOCK(ip, xlen);
+                                /* free/invalidate its buffer page */
+                                discard_metapage(mp);
+                                /* propagate up */
+                                continue;
+                        }
+                }
+                /*
+                 * the parent has other entries remaining:
+                 *
+                 * delete the router entry from the parent page.
+                 */
+                BT_MARK_DIRTY(mp, ip);
+                /*
+                 * acquire a transaction lock on the page
+                 *
+                 * action: router entry deletion
+                 */
+                tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+                dtlck = (struct dt_lock *) & tlck->lock;
+                /* linelock header */
+                if (dtlck->index >= dtlck->maxcnt)
+                        dtlck = (struct dt_lock *) txLinelock(dtlck);
+                lv = & dtlck->lv[dtlck->index];
+                lv->offset = 0;
+                lv->length = 1;
+                dtlck->index++;
+                /* linelock stbl of non-root parent page */
+                if (!(p->header.flag & BT_ROOT)) {
+                        /*
+                         * lv still points at the header vector just filled
+                         * in, so lv++ is the next free vector of the same
+                         * linelock; otherwise start a new linelock
+                         */
+                        if (dtlck->index < dtlck->maxcnt)
+                                lv++;
+                        else {
+                                dtlck = (struct dt_lock *) txLinelock(dtlck);
+                                lv = & dtlck->lv[0];
+                        }
+                        i = index >> L2DTSLOTSIZE;
+                        lv->offset = p->header.stblindex + i;
+                        lv->length =
+                            ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
+                            i + 1;
+                        dtlck->index++;
+                }
+                /* free the router entry */
+                dtDeleteEntry(p, index, &dtlck);
+                /* reset key of new leftmost entry of level (for consistency) */
+                if (index == 0 &&
+                    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
+                        dtTruncateEntry(p, 0, &dtlck);
+                /* unpin the parent page */
+                DT_PUTPAGE(mp);
+                /* exit propagation up */
+                break;
+        }
+        return 0;
+}
+#ifdef _NOTYET
+/*
+ * NAME:        dtRelocate()
+ *
+ * FUNCTION:    relocate dtpage (internal or leaf) of directory;
+ *              This function is mainly used by defragfs utility.
+ *
+ * PARAMETER:   tid     - transaction id for the locks taken;
+ *              ip      - directory inode;
+ *              lmxaddr - address of the leftmost page of the level searched
+ *                        by dtSearchNode() for the parent router entry;
+ *              opxd    - descriptor (address/length) of the extent the
+ *                        dtpage currently occupies;
+ *              nxaddr  - address the dtpage is relocated to;
+ *
+ * RETURN:      0 - success;
+ *              errno - failure from dtSearchNode() or from reading the
+ *                      target page or one of its siblings;
+ */
+int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
+               s64 nxaddr)
+{
+        int rc = 0;
+        struct metapage *mp, *pmp, *lmp, *rmp;
+        dtpage_t *p, *pp, *rp = 0, *lp= 0;
+        s64 bn;
+        int index;
+        struct btstack btstack;
+        pxd_t *pxd;
+        s64 oxaddr, nextbn, prevbn;
+        int xlen, xsize;
+        struct tlock *tlck;
+        struct dt_lock *dtlck;
+        struct pxd_lock *pxdlock;
+        s8 *stbl;
+        struct lv *lv;
+        oxaddr = addressPXD(opxd);
+        xlen = lengthPXD(opxd);
+        jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
+                   (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
+                   xlen);
+        /*
+         *      1. get the internal parent dtpage covering
+         *      router entry for the target page to be relocated;
+         */
+        rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
+        if (rc)
+                return rc;
+        /* retrieve search result */
+        DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+        jfs_info("dtRelocate: parent router entry validated.");
+        /*
+         *      2. relocate the target dtpage
+         */
+        /* read in the target page from src extent */
+        DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
+        if (rc) {
+                /* release the pinned parent page */
+                DT_PUTPAGE(pmp);
+                return rc;
+        }
+        /*
+         * read in sibling pages if any to update sibling pointers;
+         */
+        rmp = NULL;
+        if (p->header.next) {
+                nextbn = le64_to_cpu(p->header.next);
+                DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
+                if (rc) {
+                        DT_PUTPAGE(mp);
+                        DT_PUTPAGE(pmp);
+                        return (rc);
+                }
+        }
+        lmp = NULL;
+        if (p->header.prev) {
+                prevbn = le64_to_cpu(p->header.prev);
+                DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
+                if (rc) {
+                        DT_PUTPAGE(mp);
+                        DT_PUTPAGE(pmp);
+                        if (rmp)
+                                DT_PUTPAGE(rmp);
+                        return (rc);
+                }
+        }
+        /* at this point, all dtpages to be updated are in memory */
+        /*
+         * update sibling pointers of sibling dtpages if any;
+         */
+        if (lmp) {
+                tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
+                dtlck = (struct dt_lock *) & tlck->lock;
+                /* linelock header */
+                ASSERT(dtlck->index == 0);
+                lv = & dtlck->lv[0];
+                lv->offset = 0;
+                lv->length = 1;
+                dtlck->index++;
+                lp->header.next = cpu_to_le64(nxaddr);
+                DT_PUTPAGE(lmp);
+        }
+        if (rmp) {
+                tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
+                dtlck = (struct dt_lock *) & tlck->lock;
+                /* linelock header */
+                ASSERT(dtlck->index == 0);
+                lv = & dtlck->lv[0];
+                lv->offset = 0;
+                lv->length = 1;
+                dtlck->index++;
+                rp->header.prev = cpu_to_le64(nxaddr);
+                DT_PUTPAGE(rmp);
+        }
+        /*
+         * update the target dtpage to be relocated
+         *
+         * write LOG_REDOPAGE of LOG_NEW type for dst page
+         * for the whole target page (logredo() will apply
+         * after image and update bmap for allocation of the
+         * dst extent), and update bmap for allocation of
+         * the dst extent;
+         */
+        tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        /* linelock header */
+        ASSERT(dtlck->index == 0);
+        lv = & dtlck->lv[0];
+        /* update the self address in the dtpage header */
+        pxd = &p->header.self;
+        PXDaddress(pxd, nxaddr);
+        /* the dst page is the same as the src page, i.e.,
+         * linelock for afterimage of the whole page;
+         */
+        lv->offset = 0;
+        lv->length = p->header.maxslot;
+        dtlck->index++;
+        /* update the buffer extent descriptor of the dtpage */
+        /* xsize is consumed only by the _STILL_TO_PORT call below */
+        xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+#ifdef _STILL_TO_PORT
+        bmSetXD(mp, nxaddr, xsize);
+#endif /* _STILL_TO_PORT */
+        /* unpin the relocated page */
+        DT_PUTPAGE(mp);
+        jfs_info("dtRelocate: target dtpage relocated.");
+        /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
+         * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
+         * will also force a bmap update ).
+         */
+        /*
+         *      3. acquire maplock for the source extent to be freed;
+         */
+        /* for dtpage relocation, write a LOG_NOREDOPAGE record
+         * for the source dtpage (logredo() will init NoRedoPage
+         * filter and will also update bmap for free of the source
+         * dtpage), and update bmap for free of the source dtpage;
+         */
+        tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
+        pxdlock = (struct pxd_lock *) & tlck->lock;
+        pxdlock->flag = mlckFREEPXD;
+        PXDaddress(&pxdlock->pxd, oxaddr);
+        PXDlength(&pxdlock->pxd, xlen);
+        pxdlock->index = 1;
+        /*
+         *      4. update the parent router entry for relocation;
+         *
+         * acquire tlck for the parent entry covering the target dtpage;
+         * write LOG_REDOPAGE to apply after image only;
+         */
+        jfs_info("dtRelocate: update parent router entry.");
+        tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        /*
+         * NOTE(review): unlike dtRelink(), dtlck->index is not checked
+         * against dtlck->maxcnt before this vector is used - confirm the
+         * parent's linelock can never be full here.
+         */
+        lv = & dtlck->lv[dtlck->index];
+        /* update the PXD with the new address */
+        stbl = DT_GETSTBL(pp);
+        pxd = (pxd_t *) & pp->slot[stbl[index]];
+        PXDaddress(pxd, nxaddr);
+        /* linelock the single slot holding the router entry */
+        lv->offset = stbl[index];
+        lv->length = 1;
+        dtlck->index++;
+        /* unpin the parent dtpage */
+        DT_PUTPAGE(pmp);
+        return rc;
+}
+/*
+ * NAME:        dtSearchNode()
+ *
+ * FUNCTION:    Locate the internal dtpage holding the router entry whose
+ *              pxd (address and length) equals *kpxd.
+ *              This function is mainly used by defragfs utility.
+ *
+ * PARAMETER:   ip      - directory inode;
+ *              lmxaddr - address of the leftmost page of the dtree level
+ *                        to scan (0 selects the root);
+ *              kpxd    - extent descriptor to look for;
+ *              btstack - reset here; its top frame receives the result
+ *                        (bn, index, mp);
+ *
+ * RETURN:      0 - found, the page holding the entry is left pinned;
+ *              -ESTALE - a leaf was reached before the requested level,
+ *                        or the level holds no matching entry;
+ *              errno - a page could not be read;
+ */
+static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
+                        struct btstack * btstack)
+{
+        struct btframe *btsp;
+        struct metapage *mp;
+        dtpage_t *p;
+        pxd_t *pxd;
+        s8 *stbl;
+        s64 bn = 0;             /* by convention, root bn = 0 */
+        int psize = 288;        /* initial in-line directory */
+        int rc = 0;
+        int i;
+        BT_CLR(btstack);        /* reset stack */
+        /*
+         *      phase 1: walk down the leftmost edge of the tree until
+         *      the page at address lmxaddr is pinned
+         */
+        for (;;) {
+                DT_GETPAGE(ip, bn, mp, psize, p, rc);
+                if (rc)
+                        return rc;
+                /* is this the leftmost page of the requested level ? */
+                if (p->header.flag & BT_ROOT) {
+                        if (lmxaddr == 0)
+                                break;
+                } else if (addressPXD(&p->header.self) == lmxaddr)
+                        break;
+                /* a leaf has no children to descend into */
+                if (p->header.flag & BT_LEAF) {
+                        DT_PUTPAGE(mp);
+                        return -ESTALE;
+                }
+                /* step to the child named by the leftmost entry */
+                stbl = DT_GETSTBL(p);
+                pxd = (pxd_t *) &p->slot[stbl[0]];
+                bn = addressPXD(pxd);
+                psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+                DT_PUTPAGE(mp);
+        }
+        /*
+         *      phase 2: scan this page and then each right sibling
+         *      for the router entry matching kpxd
+         */
+        for (;;) {
+                stbl = DT_GETSTBL(p);
+                for (i = 0; i < p->header.nextindex; i++) {
+                        pxd = (pxd_t *) &p->slot[stbl[i]];
+                        if (addressPXD(pxd) != addressPXD(kpxd) ||
+                            lengthPXD(pxd) != lengthPXD(kpxd))
+                                continue;
+                        /* match: report it, leaving the page pinned */
+                        btsp = btstack->top;
+                        btsp->bn = bn;
+                        btsp->index = i;
+                        btsp->mp = mp;
+                        return 0;
+                }
+                /* end of the level without a match */
+                if (!p->header.next) {
+                        DT_PUTPAGE(mp);
+                        return -ESTALE;
+                }
+                /* swap the current page for its right sibling */
+                bn = le64_to_cpu(p->header.next);
+                DT_PUTPAGE(mp);
+                DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+        }
+}
+#endif /* _NOTYET */
+/*
+ *      dtRelink()
+ *
+ * function:
+ *      take a page that is about to be freed out of the sibling chain
+ *      of its level by pointing its two neighbours at each other.
+ *
+ * parameter:
+ *      tid     - transaction id for the tlocks taken on the neighbours;
+ *      ip      - directory inode;
+ *      p       - page to be freed; only header.next and header.prev
+ *                are read from it;
+ *
+ * return: 0 - success;
+ *         errno - a neighbour page could not be read;
+ */
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
+{
+        struct lv *lv;
+        struct dt_lock *dtlck;
+        struct tlock *tlck;
+        struct metapage *mp;
+        s64 prevbn = le64_to_cpu(p->header.prev);
+        s64 nextbn = le64_to_cpu(p->header.next);
+        int rc;
+        /*
+         * right neighbour: its prev pointer must skip the freed page
+         */
+        if (nextbn) {
+                /* from here on p is the neighbour, not the freed page */
+                DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                BT_MARK_DIRTY(mp, ip);
+                /* transaction lock on the neighbour for the relink */
+                tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+                jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
+                        tlck, ip, mp);
+                dtlck = (struct dt_lock *) &tlck->lock;
+                /* linelock the header, moving to a new linelock if full */
+                if (dtlck->index >= dtlck->maxcnt)
+                        dtlck = (struct dt_lock *) txLinelock(dtlck);
+                lv = dtlck->lv + dtlck->index;
+                dtlck->index++;
+                lv->offset = 0;
+                lv->length = 1;
+                p->header.prev = cpu_to_le64(prevbn);
+                DT_PUTPAGE(mp);
+        }
+        /*
+         * left neighbour: its next pointer must skip the freed page
+         */
+        if (prevbn) {
+                DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                BT_MARK_DIRTY(mp, ip);
+                /* transaction lock on the neighbour for the relink */
+                tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
+                jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
+                        tlck, ip, mp);
+                dtlck = (struct dt_lock *) &tlck->lock;
+                /* linelock the header, moving to a new linelock if full */
+                if (dtlck->index >= dtlck->maxcnt)
+                        dtlck = (struct dt_lock *) txLinelock(dtlck);
+                lv = dtlck->lv + dtlck->index;
+                dtlck->index++;
+                lv->offset = 0;
+                lv->length = 1;
+                p->header.next = cpu_to_le64(nextbn);
+                DT_PUTPAGE(mp);
+        }
+        return 0;
+}
+/*
+ *      dtInitRoot()
+ *
+ * function: initialize the directory root (stored inline in the inode as
+ *           jfs_ip->i_dtroot) as an empty leaf whose slots, other than
+ *           slot 0, are all chained on the free list.
+ *
+ * parameter:
+ *      tid     - transaction id under which the root is locked and, when
+ *                an out-of-line directory index table exists, truncated
+ *      ip      - directory inode whose root is being initialized
+ *      idotdot - inode number recorded in the root header as '..'
+ *
+ * return: none.  When the out-of-line index table is truncated,
+ *         COMMIT_Stale is set on the inode so the caller can check
+ *         i_size after commit (see the comment in the body).
+ */
+void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        dtroot_t *p;
+        int fsi;
+        struct dtslot *f;
+        struct tlock *tlck;
+        struct dt_lock *dtlck;
+        struct lv *lv;
+        u16 xflag_save;
+        /*
+         * If this was previously a non-empty directory, we need to remove
+         * the old directory table.
+         */
+        if (DO_INDEX(ip)) {
+                if (!jfs_dirtable_inline(ip)) {
+                        struct tblock *tblk = tid_to_tblock(tid);
+                        /*
+                         * We're playing games with the tid's xflag.  If
+                         * we're removing a regular file, the file's xtree
+                         * is committed with COMMIT_PMAP, but we always
+                         * commit the directory's xtree with COMMIT_PWMAP.
+                         * The saved flags are restored right after the
+                         * truncate below.
+                         */
+                        xflag_save = tblk->xflag;
+                        tblk->xflag = 0;
+                        /*
+                         * xtTruncate isn't guaranteed to fully truncate
+                         * the xtree.  The caller needs to check i_size
+                         * after committing the transaction to see if
+                         * additional truncation is needed.  The
+                         * COMMIT_Stale flag tells caller that we
+                         * initiated the truncation.
+                         */
+                        xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+                        set_cflag(COMMIT_Stale, ip);
+                        tblk->xflag = xflag_save;
+                } else
+                        ip->i_size = 1;
+                /*
+                 * restart index allocation at 2
+                 * (NOTE(review): presumably because 0 and 1 are the
+                 * readdir positions of "." and ".." - confirm)
+                 */
+                jfs_ip->next_index = 2;
+        } else
+                ip->i_size = IDATASIZE;
+        /*
+         * acquire a transaction lock on the root
+         *
+         * action: directory initialization;
+         *
+         * The root has no metapage of its own; the address of the
+         * inode's bxflag is passed in the metapage argument instead.
+         */
+        tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
+                      tlckDTREE | tlckENTRY | tlckBTROOT);
+        dtlck = (struct dt_lock *) & tlck->lock;
+        /* linelock root: one log vector covering all DTROOTMAXSLOT slots */
+        ASSERT(dtlck->index == 0);
+        lv = & dtlck->lv[0];
+        lv->offset = 0;
+        lv->length = DTROOTMAXSLOT;
+        dtlck->index++;
+        p = &jfs_ip->i_dtroot;
+        /* the empty root is both the root and a leaf; no entries yet */
+        p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
+        p->header.nextindex = 0;
+        /* init freelist */
+        fsi = 1;
+        f = &p->slot[fsi];
+        /*
+         * init data area of root: chain slot 1 -> 2 -> ... ->
+         * DTROOTMAXSLOT - 1, then terminate the chain with -1
+         */
+        for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
+                f->next = fsi;
+        f->next = -1;
+        p->header.freelist = 1;
+        /*
+         * NOTE(review): the literal 8 has to agree with the number of
+         * slots chained above (DTROOTMAXSLOT - 1) - confirm against the
+         * definition of DTROOTMAXSLOT.
+         */
+        p->header.freecnt = 8;
+        /* init '..' entry */
+        p->header.idotdot = cpu_to_le32(idotdot);
+        return;
+}
+/*
+ *      add_missing_indices()
+ *
+ * function: Fix dtree page in which one or more entries has an invalid index.
+ *           fsck.jfs should really fix this, but it currently does not.
+ *           Called from jfs_readdir when bad index is detected.
+ *
+ * parameter:
+ *      inode   - directory inode owning the page
+ *      bn      - block address of the leaf page to repair
+ *
+ * return: none.  The repair is best effort: it runs in its own
+ *         transaction, a failure to read the page is only logged, and
+ *         the result of txCommit() is deliberately ignored.
+ */
+static void add_missing_indices(struct inode *inode, s64 bn)
+{
+        struct ldtentry *d;
+        struct dt_lock *dtlck;
+        int i;
+        uint index;
+        struct lv *lv;
+        struct metapage *mp;
+        dtpage_t *p;
+        int rc;
+        s8 *stbl;
+        tid_t tid;
+        struct tlock *tlck;
+        /* private transaction for the repair; always closed at "end" */
+        tid = txBegin(inode->i_sb, 0);
+        DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
+        if (rc) {
+                printk(KERN_ERR "DT_GETPAGE failed!\n");
+                goto end;
+        }
+        BT_MARK_DIRTY(mp, inode);
+        /* only leaf entries carry a directory index */
+        ASSERT(p->header.flag & BT_LEAF);
+        tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
+        dtlck = (struct dt_lock *) &tlck->lock;
+        stbl = DT_GETSTBL(p);
+        for (i = 0; i < p->header.nextindex; i++) {
+                d = (struct ldtentry *) &p->slot[stbl[i]];
+                index = le32_to_cpu(d->index);
+                /*
+                 * an index is treated as invalid when it is below 2 or
+                 * not below the inode's next_index; give such an entry
+                 * a freshly allocated index
+                 */
+                if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
+                        d->index = cpu_to_le32(add_index(tid, inode, bn, i));
+                        /* record the modified head slot in the linelock */
+                        if (dtlck->index >= dtlck->maxcnt)
+                                dtlck = (struct dt_lock *) txLinelock(dtlck);
+                        lv = &dtlck->lv[dtlck->index];
+                        lv->offset = stbl[i];
+                        lv->length = 1;
+                        dtlck->index++;
+                }
+        }
+        DT_PUTPAGE(mp);
+        (void) txCommit(tid, 1, &inode, 0);
+end:
+        txEnd(tid);
+}
+/*
+ * Buffer to hold directory entry info while traversing a dtree page
+ * before being fed to the filldir function
+ *
+ * Records are variable-sized and packed back to back in a scratch page;
+ * next_jfs_dirent() steps from one record to the next.
+ */
+struct jfs_dirent {
+        loff_t position;        /* value stored into filp->f_pos for entry */
+        int ino;                /* inode number handed to filldir */
+        u16 name_len;           /* length of the converted name in name[] */
+        char name[0];           /* converted name bytes follow the header */
+};
+/*
+ * Step to the jfs_dirent record that follows @dirent in the buffer.
+ *
+ * The size of a record is its fixed header plus name_len + 1 bytes of
+ * name, rounded up to a multiple of sizeof (loff_t).
+ */
+static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
+{
+        const size_t align_mask = sizeof (loff_t) - 1;
+        size_t rec_size = sizeof (struct jfs_dirent) + dirent->name_len + 1;
+        /* round the record size up to the next loff_t boundary */
+        rec_size = (rec_size + align_mask) & ~align_mask;
+        return (struct jfs_dirent *) ((char *) dirent + rec_size);
+}
+/*
+ *      jfs_readdir()
+ *
+ * function: read directory entries sequentially
+ *      from the specified entry offset
+ *
+ * Entries of one leaf page are first converted into a scratch page of
+ * jfs_dirent records while the leaf is pinned; the leaf is then unpinned
+ * and the records are handed to filldir.
+ *
+ * parameter:
+ *      filp    - open directory; filp->f_pos is the position to resume
+ *                from and is updated as entries are emitted
+ *      dirent  - opaque cookie passed through to filldir
+ *      filldir - callback receiving each entry; a non-zero return stops
+ *                the enumeration
+ *
+ * position encoding:
+ *      indexed directory (DO_INDEX): f_pos is the persistent index of
+ *              the entry; 0 is ".", 1 is "..", DIREND is end of directory
+ *      legacy directory: f_pos is an OS/2 style (pn, index) pair overlaid
+ *              on the loff_t through struct dtoffset
+ *
+ * return: 0 on success or when enumeration simply stops (including several
+ *      error paths that only set f_pos to DIREND); a non-zero rc from
+ *      read_index(), dtReadFirst() or a later DT_GETPAGE(); -ENOMEM if
+ *      the scratch page cannot be allocated.
+ */
+int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        struct inode *ip = filp->f_dentry->d_inode;
+        struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
+        int rc = 0;
+        loff_t dtpos;   /* legacy OS/2 style position */
+        struct dtoffset {
+                s16 pn;
+                s16 index;
+                s32 unused;
+        } *dtoffset = (struct dtoffset *) &dtpos;
+        s64 bn;
+        struct metapage *mp;
+        dtpage_t *p;
+        int index;
+        s8 *stbl;
+        struct btstack btstack;
+        int i, next;
+        struct ldtentry *d;
+        struct dtslot *t;
+        int d_namleft, len, outlen;
+        unsigned long dirent_buf;
+        char *name_ptr;
+        u32 dir_index;
+        int do_index = 0;
+        uint loop_count = 0;
+        struct jfs_dirent *jfs_dirent;
+        int jfs_dirents;
+        int overflow, fix_page, page_fixed = 0;
+        static int unique_pos = 2;      /* If we can't fix broken index */
+        if (filp->f_pos == DIREND)
+                return 0;
+        if (DO_INDEX(ip)) {
+                /*
+                 * persistent index is stored in directory entries.
+                 * Special cases:        0 = .
+                 *                       1 = ..
+                 *                  DIREND = End of directory
+                 */
+                do_index = 1;
+                dir_index = (u32) filp->f_pos;
+                if (dir_index > 1) {
+                        struct dir_table_slot dirtab_slot;
+                        if (dtEmpty(ip) ||
+                            (dir_index >= JFS_IP(ip)->next_index)) {
+                                /* Stale position.  Directory has shrunk */
+                                filp->f_pos = DIREND;
+                                return 0;
+                        }
+                      repeat:
+                        rc = read_index(ip, dir_index, &dirtab_slot);
+                        if (rc) {
+                                filp->f_pos = DIREND;
+                                return rc;
+                        }
+                        if (dirtab_slot.flag == DIR_INDEX_FREE) {
+                                /*
+                                 * The entry at this position is gone;
+                                 * follow the slot's addr2 link to the
+                                 * next index.  loop_count bounds the walk
+                                 * so a corrupt chain cannot spin forever.
+                                 */
+                                if (loop_count++ > JFS_IP(ip)->next_index) {
+                                        jfs_err("jfs_readdir detected "
+                                                   "infinite loop!");
+                                        filp->f_pos = DIREND;
+                                        return 0;
+                                }
+                                dir_index = le32_to_cpu(dirtab_slot.addr2);
+                                /* -1 terminates the chain (u32 compare) */
+                                if (dir_index == -1) {
+                                        filp->f_pos = DIREND;
+                                        return 0;
+                                }
+                                goto repeat;
+                        }
+                        /* the index table names the leaf page and slot */
+                        bn = addressDTS(&dirtab_slot);
+                        index = dirtab_slot.slot;
+                        DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc) {
+                                filp->f_pos = DIREND;
+                                return 0;
+                        }
+                        if (p->header.flag & BT_INTERNAL) {
+                                jfs_err("jfs_readdir: bad index table");
+                                DT_PUTPAGE(mp);
+                                /*
+                                 * Use DIREND like every other
+                                 * end-of-directory path, so the check at
+                                 * the top of this function recognizes it
+                                 * on the next call.
+                                 */
+                                filp->f_pos = DIREND;
+                                return 0;
+                        }
+                } else {
+                        if (dir_index == 0) {
+                                /*
+                                 * self "."
+                                 */
+                                filp->f_pos = 0;
+                                if (filldir(dirent, ".", 1, 0, ip->i_ino,
+                                            DT_DIR))
+                                        return 0;
+                        }
+                        /*
+                         * parent ".."
+                         */
+                        filp->f_pos = 1;
+                        if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+                                return 0;
+                        /*
+                         * Find first entry of left-most leaf
+                         */
+                        if (dtEmpty(ip)) {
+                                filp->f_pos = DIREND;
+                                return 0;
+                        }
+                        if ((rc = dtReadFirst(ip, &btstack)))
+                                return rc;
+                        DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+                }
+        } else {
+                /*
+                 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
+                 *
+                 * pn = index = 0:      First entry "."
+                 * pn = 0; index = 1:   Second entry ".."
+                 * pn > 0:              Real entries, pn=1 -> leftmost page
+                 * pn = index = -1:     No more entries
+                 */
+                dtpos = filp->f_pos;
+                if (dtpos == 0) {
+                        /* build "." entry */
+                        if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
+                                    DT_DIR))
+                                return 0;
+                        dtoffset->index = 1;
+                        filp->f_pos = dtpos;
+                }
+                if (dtoffset->pn == 0) {
+                        if (dtoffset->index == 1) {
+                                /* build ".." entry */
+                                if (filldir(dirent, "..", 2, filp->f_pos,
+                                            PARENT(ip), DT_DIR))
+                                        return 0;
+                        } else {
+                                jfs_err("jfs_readdir called with "
+                                        "invalid offset!");
+                        }
+                        /* continue with the first entry of the first leaf */
+                        dtoffset->pn = 1;
+                        dtoffset->index = 0;
+                        filp->f_pos = dtpos;
+                }
+                if (dtEmpty(ip)) {
+                        filp->f_pos = DIREND;
+                        return 0;
+                }
+                if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+                        jfs_err("jfs_readdir: unexpected rc = %d "
+                                "from dtReadNext", rc);
+                        filp->f_pos = DIREND;
+                        return 0;
+                }
+                /* get start leaf page and index */
+                DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+                /* offset beyond directory eof ? */
+                if (bn < 0) {
+                        filp->f_pos = DIREND;
+                        return 0;
+                }
+        }
+        /* scratch page holding the jfs_dirent records of one leaf page */
+        dirent_buf = __get_free_page(GFP_KERNEL);
+        if (dirent_buf == 0) {
+                DT_PUTPAGE(mp);
+                jfs_warn("jfs_readdir: __get_free_page failed!");
+                filp->f_pos = DIREND;
+                return -ENOMEM;
+        }
+        /* one iteration per (possibly partial) pass over a leaf page */
+        while (1) {
+                jfs_dirent = (struct jfs_dirent *) dirent_buf;
+                jfs_dirents = 0;
+                overflow = fix_page = 0;
+                stbl = DT_GETSTBL(p);
+                for (i = index; i < p->header.nextindex; i++) {
+                        d = (struct ldtentry *) & p->slot[stbl[i]];
+                        if (((long) jfs_dirent + d->namlen + 1) >
+                            (dirent_buf + PSIZE)) {
+                                /* DBCS codepages could overrun dirent_buf */
+                                index = i;
+                                overflow = 1;
+                                break;
+                        }
+                        d_namleft = d->namlen;
+                        name_ptr = jfs_dirent->name;
+                        jfs_dirent->ino = le32_to_cpu(d->inumber);
+                        if (do_index) {
+                                len = min(d_namleft, DTLHDRDATALEN);
+                                jfs_dirent->position = le32_to_cpu(d->index);
+                                /*
+                                 * d->index should always be valid, but it
+                                 * isn't.  fsck.jfs doesn't create the
+                                 * directory index for the lost+found
+                                 * directory.  Rather than let it go,
+                                 * we can try to fix it.
+                                 */
+                                if ((jfs_dirent->position < 2) ||
+                                    (jfs_dirent->position >=
+                                     JFS_IP(ip)->next_index)) {
+                                        if (!page_fixed && !isReadOnly(ip)) {
+                                                fix_page = 1;
+                                                /*
+                                                 * setting overflow and setting
+                                                 * index to i will cause the
+                                                 * same page to be processed
+                                                 * again starting here
+                                                 */
+                                                overflow = 1;
+                                                index = i;
+                                                break;
+                                        }
+                                        /*
+                                         * could not (or already tried to)
+                                         * repair: hand out a made-up
+                                         * position instead
+                                         */
+                                        jfs_dirent->position = unique_pos++;
+                                }
+                        } else {
+                                jfs_dirent->position = dtpos;
+                                len = min(d_namleft, DTLHDRDATALEN_LEGACY);
+                        }
+                        /* copy the name of head/only segment */
+                        outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
+                                                   codepage);
+                        jfs_dirent->name_len = outlen;
+                        /* copy name in the additional segment(s) */
+                        next = d->next;
+                        while (next >= 0) {
+                                t = (struct dtslot *) & p->slot[next];
+                                name_ptr += outlen;
+                                d_namleft -= len;
+                                /* Sanity Check */
+                                if (d_namleft == 0) {
+                                        jfs_error(ip->i_sb,
+                                                  "JFS:Dtree error: ino = "
+                                                  "%ld, bn=%Ld, index = %d",
+                                                  (long)ip->i_ino,
+                                                  (long long)bn,
+                                                  i);
+                                        goto skip_one;
+                                }
+                                len = min(d_namleft, DTSLOTDATALEN);
+                                outlen = jfs_strfromUCS_le(name_ptr, t->name,
+                                                           len, codepage);
+                                jfs_dirent->name_len += outlen;
+                                next = t->next;
+                        }
+                        jfs_dirents++;
+                        jfs_dirent = next_jfs_dirent(jfs_dirent);
+skip_one:
+                        /* legacy position advances even for a bad entry */
+                        if (!do_index)
+                                dtoffset->index++;
+                }
+                if (!overflow) {
+                        /* Point to next leaf page */
+                        if (p->header.flag & BT_ROOT)
+                                bn = 0;
+                        else {
+                                bn = le64_to_cpu(p->header.next);
+                                index = 0;
+                                /* update offset (pn:index) for new page */
+                                if (!do_index) {
+                                        dtoffset->pn++;
+                                        dtoffset->index = 0;
+                                }
+                        }
+                        page_fixed = 0;
+                }
+                /* unpin previous leaf page */
+                DT_PUTPAGE(mp);
+                /* feed the buffered entries to the caller, page unpinned */
+                jfs_dirent = (struct jfs_dirent *) dirent_buf;
+                while (jfs_dirents--) {
+                        filp->f_pos = jfs_dirent->position;
+                        if (filldir(dirent, jfs_dirent->name,
+                                    jfs_dirent->name_len, filp->f_pos,
+                                    jfs_dirent->ino, DT_UNKNOWN))
+                                goto out;
+                        jfs_dirent = next_jfs_dirent(jfs_dirent);
+                }
+                /*
+                 * repair the page whose bad index stopped the scan; bn
+                 * still names that page because overflow was set
+                 */
+                if (fix_page) {
+                        add_missing_indices(ip, bn);
+                        page_fixed = 1;
+                }
+                /* no next leaf: the whole directory has been read */
+                if (!overflow && (bn == 0)) {
+                        filp->f_pos = DIREND;
+                        break;
+                }
+                /* pin the next leaf, or re-pin the same one on overflow */
+                DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc) {
+                        free_page(dirent_buf);
+                        return rc;
+                }
+        }
+      out:
+        free_page(dirent_buf);
+        return rc;
+}
+/*
+ *      dtReadFirst()
+ *
+ * function: get the leftmost page of the directory
+ *
+ * parameter:
+ *      ip      - directory inode
+ *      btstack - traversal stack; reset here, then filled with one frame
+ *                per internal page visited on the way down
+ *
+ * return: 0 with the leftmost leaf left pinned and described by
+ *         btstack->top (bn, index = 0, mp); a non-zero rc from
+ *         DT_GETPAGE(); -EIO if the traversal stack overflows.
+ */
+static int dtReadFirst(struct inode *ip, struct btstack * btstack)
+{
+        int rc = 0;
+        s64 bn;
+        int psize = 288;        /* initial in-line directory */
+        struct metapage *mp;
+        dtpage_t *p;
+        s8 *stbl;
+        struct btframe *btsp;
+        pxd_t *xd;
+        BT_CLR(btstack);        /* reset stack */
+        /*
+         *      descend leftmost path of the tree
+         *
+         * by convention, root bn = 0.
+         */
+        for (bn = 0;;) {
+                DT_GETPAGE(ip, bn, mp, psize, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * leftmost leaf page
+                 */
+                if (p->header.flag & BT_LEAF) {
+                        /* return leftmost entry; the page stays pinned */
+                        btsp = btstack->top;
+                        btsp->bn = bn;
+                        btsp->index = 0;
+                        btsp->mp = mp;
+                        return 0;
+                }
+                /*
+                 * descend down to leftmost child page
+                 */
+                if (BT_STACK_FULL(btstack)) {
+                        DT_PUTPAGE(mp);
+                        jfs_error(ip->i_sb, "dtReadFirst: btstack overrun");
+                        BT_STACK_DUMP(btstack);
+                        return -EIO;
+                }
+                /* push (bn, index) of the parent page/entry */
+                BT_PUSH(btstack, bn, 0);
+                /* get the leftmost entry */
+                stbl = DT_GETSTBL(p);
+                xd = (pxd_t *) & p->slot[stbl[0]];
+                /* get the child page block address and its size in bytes */
+                bn = addressPXD(xd);
+                psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
+                /* unpin the parent page */
+                DT_PUTPAGE(mp);
+        }
+}
+/*
+ *      dtReadNext()
+ *
+ * function: get the page of the specified offset (pn:index)
+ *
+ * parameter:
+ *      ip      - directory inode
+ *      offset  - legacy position, interpreted through struct dtoffset as
+ *                (pn, index); pn = 1 is the leftmost leaf.  It is
+ *                advanced in place when the search has to move on to
+ *                the first entry of the following leaf.
+ *      btstack - traversal stack; on success its top frame describes
+ *                the target leaf (bn, index, mp)
+ *
+ * return: 0 on success, or the rc of a failed page read.
+ *         if (offset > eof), bn = -1 in the top frame;
+ *
+ * note: if index > nextindex of the target leaf page,
+ * start with 1st entry of next leaf page;
+ *
+ * note: on several of the bn = -1 paths the page behind mp has already
+ * been unpinned, so the caller must test bn before using the page.
+ */
+static int dtReadNext(struct inode *ip, loff_t * offset,
+                      struct btstack * btstack)
+{
+        int rc = 0;
+        struct dtoffset {
+                s16 pn;
+                s16 index;
+                s32 unused;
+        } *dtoffset = (struct dtoffset *) offset;
+        s64 bn;
+        struct metapage *mp;
+        dtpage_t *p;
+        int index;
+        int pn;
+        s8 *stbl;
+        struct btframe *btsp, *parent;
+        pxd_t *xd;
+        /*
+         * get leftmost leaf page pinned
+         */
+        if ((rc = dtReadFirst(ip, btstack)))
+                return rc;
+        /* get leaf page */
+        DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
+        /* get the start offset (pn:index) */
+        pn = dtoffset->pn - 1;  /* Now pn = 0 represents leftmost leaf */
+        index = dtoffset->index;
+        /* start at leftmost page ? */
+        if (pn == 0) {
+                /* index still inside the leftmost leaf: done */
+                if (index < p->header.nextindex)
+                        goto out;
+                /* offset beyond eof: a root leaf has no next page */
+                if (p->header.flag & BT_ROOT) {
+                        bn = -1;
+                        goto out;
+                }
+                /* start with 1st entry of next leaf page */
+                dtoffset->pn++;
+                dtoffset->index = index = 0;
+                goto a;
+        }
+        /* start at non-leftmost page: scan parent pages for large pn */
+        if (p->header.flag & BT_ROOT) {
+                bn = -1;
+                goto out;
+        }
+        /* start after next leaf page ? */
+        if (pn > 1)
+                goto b;
+        /* get leaf page pn = 1 by following the leftmost leaf's next link */
+      a:
+        bn = le64_to_cpu(p->header.next);
+        /* unpin leaf page */
+        DT_PUTPAGE(mp);
+        /* offset beyond eof ? */
+        if (bn == 0) {
+                bn = -1;
+                goto out;
+        }
+        goto c;
+        /*
+         * scan last internal page level to get target leaf page
+         */
+      b:
+        /* unpin leftmost leaf page */
+        DT_PUTPAGE(mp);
+        /* get left most parent page: the frame just below the stack top */
+        btsp = btstack->top;
+        parent = btsp - 1;
+        bn = parent->bn;
+        DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /*
+         * scan parent pages at last internal page level; every parent
+         * entry stands for one leaf, so skip whole parents until pn
+         * falls inside the current one
+         */
+        while (pn >= p->header.nextindex) {
+                pn -= p->header.nextindex;
+                /* get next parent page address */
+                bn = le64_to_cpu(p->header.next);
+                /* unpin current parent page */
+                DT_PUTPAGE(mp);
+                /* offset beyond eof ? */
+                if (bn == 0) {
+                        bn = -1;
+                        goto out;
+                }
+                /* get next parent page */
+                DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /* update parent page stack frame */
+                parent->bn = bn;
+        }
+        /* get leaf page address */
+        stbl = DT_GETSTBL(p);
+        xd = (pxd_t *) & p->slot[stbl[pn]];
+        bn = addressPXD(xd);
+        /* unpin parent page */
+        DT_PUTPAGE(mp);
+        /*
+         * get target leaf page
+         */
+      c:
+        DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /*
+         * leaf page has been completed:
+         * start with 1st entry of next leaf page
+         */
+        if (index >= p->header.nextindex) {
+                bn = le64_to_cpu(p->header.next);
+                /* unpin leaf page */
+                DT_PUTPAGE(mp);
+                /* offset beyond eof ? */
+                if (bn == 0) {
+                        bn = -1;
+                        goto out;
+                }
+                /* get next leaf page */
+                DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /* start with 1st entry of next leaf page */
+                dtoffset->pn++;
+                dtoffset->index = 0;
+        }
+      out:
+        /* return target leaf page pinned (bn = -1: no page, see header) */
+        btsp = btstack->top;
+        btsp->bn = bn;
+        btsp->index = dtoffset->index;
+        btsp->mp = mp;
+        return 0;
+}
+/*
+ *      dtCompare()
+ *
+ * function: compare search key with an internal entry
+ *
+ * The entry name is stored in segments: a head segment inside the
+ * idtentry itself, followed by a chain of dtslot segments linked through
+ * their next fields.  The key is compared segment by segment.
+ *
+ * return:
+ *      < 0 if k is < record
+ *      = 0 if k is = record
+ *      > 0 if k is > record
+ */
+static int dtCompare(struct component_name * key,       /* search key */
+                     dtpage_t * p,      /* directory page */
+                     int si)
+{                               /* entry slot index */
+        /*
+         * Design note carried over from the original implementation:
+         *
+         * force the left-most key on internal pages, at any level of
+         * the tree, to be less than any search key.
+         * this obviates having to update the leftmost key on an internal
+         * page when the user inserts a new key in the tree smaller than
+         * anything that has been stored.
+         *
+         * (? if/when dtSearch() narrows down to 1st entry (index = 0),
+         * at any internal page at any level of the tree,
+         * it descends to child of the entry anyway -
+         * ? make the entry as min size dummy entry)
+         *
+         * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
+         * return (1);
+         */
+        struct idtentry *ih = (struct idtentry *) & p->slot[si];
+        wchar_t *kname = key->name;
+        __le16 *name = ih->name;
+        int klen = key->namlen;
+        int namlen = ih->namlen;
+        int len, rc;
+        struct dtslot *t;
+        /* head segment: bounded by the header capacity and by the key */
+        len = min(namlen, DTIHDRDATALEN);
+        len = min(klen, len);
+        rc = UniStrncmp_le(kname, name, len);
+        if (rc)
+                return rc;
+        kname += len;
+        klen -= len;
+        namlen -= len;
+        /* continuation segments, while both names have characters left */
+        for (si = ih->next; klen > 0 && namlen > 0; si = t->next) {
+                t = (struct dtslot *) & p->slot[si];
+                name = t->name;
+                len = min(namlen, DTSLOTDATALEN);
+                len = min(klen, len);
+                rc = UniStrncmp_le(kname, name, len);
+                if (rc)
+                        return rc;
+                kname += len;
+                klen -= len;
+                namlen -= len;
+        }
+        /* common prefix is equal: whichever name has characters left wins */
+        return klen - namlen;
+}
+/*
+ *      ciCompare()
+ *
+ * function: compare search key with an (leaf/internal) entry
+ *
+ * When all JFS_OS2 bits are set in flag, every stored character is
+ * upper-cased with UniToupper() before it is compared; the key is used
+ * as given.  For a leaf entry the capacity of the head segment depends
+ * on whether JFS_DIR_INDEX is set in flag.
+ *
+ * return:
+ *      < 0 if k is < record
+ *      = 0 if k is = record
+ *      > 0 if k is > record
+ */
+static int ciCompare(struct component_name * key,       /* search key */
+                     dtpage_t * p,      /* directory page */
+                     int si,    /* entry slot index */
+                     int flag)
+{
+        /*
+         * Design note carried over from the original implementation:
+         *
+         * force the left-most key on internal pages, at any level of
+         * the tree, to be less than any search key.
+         * this obviates having to update the leftmost key on an internal
+         * page when the user inserts a new key in the tree smaller than
+         * anything that has been stored.
+         *
+         * (? if/when dtSearch() narrows down to 1st entry (index = 0),
+         * at any internal page at any level of the tree,
+         * it descends to child of the entry anyway -
+         * ? make the entry as min size dummy entry)
+         *
+         * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
+         * return (1);
+         */
+        const int fold_case = ((flag & JFS_OS2) == JFS_OS2);
+        wchar_t *kname = key->name;
+        wchar_t x;
+        __le16 *name;
+        int klen = key->namlen;
+        int namlen, len, rc;
+        int hdrmax;
+        struct dtslot *t;
+        int i;
+        if (p->header.flag & BT_LEAF) {
+                /* leaf page entry */
+                struct ldtentry *lh = (struct ldtentry *) & p->slot[si];
+                si = lh->next;
+                name = lh->name;
+                namlen = lh->namlen;
+                if (flag & JFS_DIR_INDEX)
+                        hdrmax = DTLHDRDATALEN;
+                else
+                        hdrmax = DTLHDRDATALEN_LEGACY;
+        } else {
+                /* internal page entry */
+                struct idtentry *ih = (struct idtentry *) & p->slot[si];
+                si = ih->next;
+                name = ih->name;
+                namlen = ih->namlen;
+                hdrmax = DTIHDRDATALEN;
+        }
+        /* head segment: bounded by the header capacity and by the key */
+        len = min(namlen, hdrmax);
+        len = min(klen, len);
+        for (i = 0; i < len; i++) {
+                /* only uppercase if case-insensitive support is on */
+                if (fold_case)
+                        x = UniToupper(le16_to_cpu(name[i]));
+                else
+                        x = le16_to_cpu(name[i]);
+                rc = kname[i] - x;
+                if (rc)
+                        return rc;
+        }
+        kname += len;
+        klen -= len;
+        namlen -= len;
+        /* continuation segments, while both names have characters left */
+        for (; klen > 0 && namlen > 0; si = t->next) {
+                t = (struct dtslot *) & p->slot[si];
+                name = t->name;
+                len = min(namlen, DTSLOTDATALEN);
+                len = min(klen, len);
+                for (i = 0; i < len; i++) {
+                        /* only uppercase if case-insensitive support is on */
+                        if (fold_case)
+                                x = UniToupper(le16_to_cpu(name[i]));
+                        else
+                                x = le16_to_cpu(name[i]);
+                        rc = kname[i] - x;
+                        if (rc)
+                                return rc;
+                }
+                kname += len;
+                klen -= len;
+                namlen -= len;
+        }
+        /* common prefix is equal: whichever name has characters left wins */
+        return klen - namlen;
+}
+/*
+ *      ciGetLeafPrefixKey()
+ *
+ * function: compute prefix of suffix compression
+ *           from two adjacent leaf entries
+ *           across page boundary
+ *
+ * The result is the shortest leading part of the right key that still
+ * differs from the left key: the characters the two keys share plus the
+ * first character of the right key that differs or lies beyond the end
+ * of the left key.  With all JFS_OS2 bits set in flag, both keys are
+ * upper-cased before they are compared.
+ *
+ * parameter:
+ *      lp, li  - page and entry index of the left (smaller) leaf entry
+ *      rp, ri  - page and entry index of the right (larger) leaf entry
+ *      key     - output; key->name must be supplied by the caller and is
+ *                filled in, key->namlen receives the prefix length
+ *      flag    - passed on to dtGetKey(); also selects case folding
+ *
+ * return: 0 on success, -ENOMEM if a temporary key buffer cannot be
+ *         allocated
+ */
+static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+                               int ri, struct component_name * key, int flag)
+{
+        int klen, namlen;
+        wchar_t *pl, *pr, *kname;
+        struct component_name lkey;
+        struct component_name rkey;
+        /*
+         * Temporary full-length copies of both keys.  A failed
+         * allocation is an out-of-memory condition, so report -ENOMEM
+         * rather than -ENOSPC, which would tell the caller that the
+         * volume is full.
+         */
+        lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+                            GFP_KERNEL);
+        if (lkey.name == NULL)
+                return -ENOMEM;
+        rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+                            GFP_KERNEL);
+        if (rkey.name == NULL) {
+                kfree(lkey.name);
+                return -ENOMEM;
+        }
+        /* get left and right key */
+        dtGetKey(lp, li, &lkey, flag);
+        lkey.name[lkey.namlen] = 0;
+        if ((flag & JFS_OS2) == JFS_OS2)
+                ciToUpper(&lkey);
+        dtGetKey(rp, ri, &rkey, flag);
+        rkey.name[rkey.namlen] = 0;
+        if ((flag & JFS_OS2) == JFS_OS2)
+                ciToUpper(&rkey);
+        /*
+         * compute prefix: copy characters of the right key until the
+         * first one that differs from the left key (that one included)
+         */
+        klen = 0;
+        kname = key->name;
+        namlen = min(lkey.namlen, rkey.namlen);
+        for (pl = lkey.name, pr = rkey.name;
+             namlen; pl++, pr++, namlen--, klen++, kname++) {
+                *kname = *pr;
+                if (*pl != *pr) {
+                        key->namlen = klen + 1;
+                        goto free_names;
+                }
+        }
+        /* l->namlen <= r->namlen since l <= r */
+        if (lkey.namlen < rkey.namlen) {
+                /* left key is a proper prefix: take one more character */
+                *kname = *pr;
+                key->namlen = klen + 1;
+        } else                  /* l->namelen == r->namelen */
+                key->namlen = klen;
+free_names:
+        kfree(lkey.name);
+        kfree(rkey.name);
+        return 0;
+}
+/*
+ *      dtGetKey()
+ *
+ * function: copy the name of entry i on page p into key
+ *
+ * key->namlen is set to the name length stored in the entry; the
+ * characters are gathered into key->name with UniStrncpy_from_le(),
+ * first from the head segment and then from each chained segment.
+ * The caller supplies the storage behind key->name.
+ */
+static void dtGetKey(dtpage_t * p, int i,       /* entry index */
+                     struct component_name * key, int flag)
+{
+        s8 *stbl = DT_GETSTBL(p);
+        int cur = stbl[i];      /* slot currently being read */
+        int remaining;          /* characters not yet accounted for */
+        int chunk;              /* characters held by the current segment */
+        wchar_t *out;
+        __le16 *src;
+        struct dtslot *seg;
+        if (p->header.flag & BT_LEAF) {
+                struct ldtentry *leaf = (struct ldtentry *) &p->slot[cur];
+                /* indexed directories keep fewer characters in the head */
+                int headmax = (flag & JFS_DIR_INDEX) ?
+                    DTLHDRDATALEN : DTLHDRDATALEN_LEGACY;
+                cur = leaf->next;
+                remaining = leaf->namlen;
+                src = leaf->name;
+                chunk = min(remaining, headmax);
+        } else {
+                struct idtentry *router = (struct idtentry *) &p->slot[cur];
+                cur = router->next;
+                remaining = router->namlen;
+                src = router->name;
+                chunk = min(remaining, DTIHDRDATALEN);
+        }
+        key->namlen = remaining;
+        out = key->name;
+        /* head/only segment */
+        UniStrncpy_from_le(out, src, chunk);
+        /* follow the chain until the next link is negative */
+        for (; cur >= 0; cur = seg->next) {
+                seg = &p->slot[cur];
+                out += chunk;
+                remaining -= chunk;
+                chunk = min(remaining, DTSLOTDATALEN);
+                UniStrncpy_from_le(out, seg->name, chunk);
+        }
+}
+/*
+ *      dtInsertEntry()
+ *
+ * function: allocate free slot(s) and
+ *           write a leaf/internal entry
+ *
+ * parameters:
+ *      p       - directory page receiving the entry
+ *      index   - position in the sorted entry table (stbl) for the entry
+ *      key     - name to store (key->namlen characters at key->name)
+ *      data    - leaf page: inode number, plus tid/ip which are handed to
+ *                add_index()/modify_index() when ip is non-NULL;
+ *                internal page: extent descriptor copied into the entry
+ *      dtlock  - in/out: linelock being filled; updated on return because
+ *                txLinelock() may have supplied a new one
+ *
+ * return: none; the head slot index is stored in stbl[index] and
+ *         header.nextindex is incremented
+ */
+static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
+                          ddata_t * data, struct dt_lock ** dtlock)
+{
+        struct dtslot *h, *t;   /* h: head segment; t: last segment written */
+        struct ldtentry *lh = NULL;
+        struct idtentry *ih = NULL;
+        int hsi, fsi, klen, len, nextindex;     /* hsi: head slot; fsi: slot just taken off the freelist */
+        wchar_t *kname;
+        __le16 *name;
+        s8 *stbl;
+        pxd_t *xd;
+        struct dt_lock *dtlck = *dtlock;
+        struct lv *lv;
+        int xsi, n;             /* xsi: previously used slot; n: slots in the open line vector */
+        s64 bn = 0;             /* stays 0 when p is a BT_ROOT page */
+        struct metapage *mp = NULL;
+        klen = key->namlen;
+        kname = key->name;
+        /* allocate a free slot: unlink the head of the page freelist;
+         * it becomes the head segment of the new entry */
+        hsi = fsi = p->header.freelist;
+        h = &p->slot[fsi];
+        p->header.freelist = h->next;
+        --p->header.freecnt;
+        /* open new linelock (ask txLinelock() for another lock when the
+         * current one has no vector left) */
+        if (dtlck->index >= dtlck->maxcnt)
+                dtlck = (struct dt_lock *) txLinelock(dtlck);
+        lv = & dtlck->lv[dtlck->index];
+        lv->offset = hsi;
+        /* write head/only segment */
+        if (p->header.flag & BT_LEAF) {
+                lh = (struct ldtentry *) h;
+                lh->next = h->next;
+                lh->inumber = cpu_to_le32(data->leaf.ino);
+                lh->namlen = klen;
+                name = lh->name;
+                if (data->leaf.ip) {    /* non-NULL ip: entry also gets an index value */
+                        len = min(klen, DTLHDRDATALEN);
+                        if (!(p->header.flag & BT_ROOT))
+                                bn = addressPXD(&p->header.self);
+                        lh->index = cpu_to_le32(add_index(data->leaf.tid,
+                                                          data->leaf.ip,
+                                                          bn, index));
+                } else
+                        len = min(klen, DTLHDRDATALEN_LEGACY);
+        } else {
+                ih = (struct idtentry *) h;
+                ih->next = h->next;
+                xd = (pxd_t *) ih;      /* the descriptor sits at the start of the entry */
+                *xd = data->xd;
+                ih->namlen = klen;
+                name = ih->name;
+                len = min(klen, DTIHDRDATALEN);
+        }
+        UniStrncpy_to_le(name, kname, len);
+        n = 1;
+        xsi = hsi;
+        /* write additional segment(s): one freelist slot per
+         * DTSLOTDATALEN characters still to be stored */
+        t = h;
+        klen -= len;
+        while (klen) {
+                /* get free slot */
+                fsi = p->header.freelist;
+                t = &p->slot[fsi];
+                p->header.freelist = t->next;
+                --p->header.freecnt;
+                /* is next slot contiguous ? (a line vector describes one
+                 * run of adjacent slots, so a gap starts a new vector) */
+                if (fsi != xsi + 1) {
+                        /* close current linelock */
+                        lv->length = n;
+                        dtlck->index++;
+                        /* open new linelock */
+                        if (dtlck->index < dtlck->maxcnt)
+                                lv++;
+                        else {
+                                dtlck = (struct dt_lock *) txLinelock(dtlck);
+                                lv = & dtlck->lv[0];
+                        }
+                        lv->offset = fsi;
+                        n = 0;
+                }
+                kname += len;
+                len = min(klen, DTSLOTDATALEN);
+                UniStrncpy_to_le(t->name, kname, len);
+                n++;
+                xsi = fsi;
+                klen -= len;
+        }
+        /* close current linelock */
+        lv->length = n;
+        dtlck->index++;
+        *dtlock = dtlck;
+        /* terminate last/only segment (the next field still holds the
+         * old freelist link copied above, so it is overwritten with -1) */
+        if (h == t) {
+                /* single segment entry */
+                if (p->header.flag & BT_LEAF)
+                        lh->next = -1;
+                else
+                        ih->next = -1;
+        } else
+                /* multi-segment entry */
+                t->next = -1;
+        /* if insert into middle, shift right succeeding entries in stbl */
+        stbl = DT_GETSTBL(p);
+        nextindex = p->header.nextindex;
+        if (index < nextindex) {
+                memmove(stbl + index + 1, stbl + index, nextindex - index);
+                if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
+                        s64 lblock;
+                        /*
+                         * Need to update slot number for entries that moved
+                         * in the stbl
+                         */
+                        mp = NULL;
+                        for (n = index + 1; n <= nextindex; n++) {      /* the entries just shifted right */
+                                lh = (struct ldtentry *) & (p->slot[stbl[n]]);
+                                modify_index(data->leaf.tid, data->leaf.ip,
+                                             le32_to_cpu(lh->index), bn, n,
+                                             &mp, &lblock);
+                        }
+                        if (mp)
+                                release_metapage(mp);
+                }
+        }
+        stbl[index] = hsi;
+        /* advance next available entry index of stbl */
+        ++p->header.nextindex;
+}
+/*
+ *      dtMoveEntry()
+ *
+ * function: move entries from split/left page to new/right page
+ *
+ *      nextindex of dst page and freelist/freecnt of both pages
+ *      are updated.
+ *
+ * parameters:
+ *      sp, si   - source page and the first stbl index to move; every
+ *                 entry from si up to sp->header.nextindex - 1 is moved
+ *      dp       - destination page; entries are written from stbl index 0
+ *                 into consecutive slots starting at dp->header.freelist
+ *      sdtlock  - in/out: source linelock (may be replaced via txLinelock())
+ *      ddtlock  - in/out: destination linelock
+ *      do_index - non-zero: leaf heads use DTLHDRDATALEN and carry the
+ *                 index field; zero: DTLHDRDATALEN_LEGACY
+ *
+ *      sp->header.nextindex is not modified in this function.
+ */
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+                        struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
+                        int do_index)
+{
+        int ssi, next;          /* src slot index; next: link to the following src segment */
+        int di;                 /* dst entry index */
+        int dsi;                /* dst slot index */
+        s8 *sstbl, *dstbl;      /* sorted entry table */
+        int snamlen, len;       /* characters left to move; characters in this segment */
+        struct ldtentry *slh, *dlh = NULL;
+        struct idtentry *sih, *dih = NULL;
+        struct dtslot *h, *s, *d;       /* h: dst head; s: current src; d: current dst */
+        struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
+        struct lv *slv, *dlv;
+        int xssi, ns, nd;       /* xssi: previous src slot; ns: slots in open src vector; nd: total slots moved */
+        int sfsi;               /* head of the source freelist being rebuilt */
+        sstbl = (s8 *) & sp->slot[sp->header.stblindex];
+        dstbl = (s8 *) & dp->slot[dp->header.stblindex];
+        dsi = dp->header.freelist;      /* first (whole page) free slot */
+        sfsi = sp->header.freelist;
+        /* linelock destination entry slot (a single vector: destination
+         * slots are taken consecutively, see dsi++ / d++ below) */
+        dlv = & ddtlck->lv[ddtlck->index];
+        dlv->offset = dsi;
+        /* linelock source entry slot */
+        slv = & sdtlck->lv[sdtlck->index];
+        slv->offset = sstbl[si];
+        xssi = slv->offset - 1; /* makes the first head slot test as contiguous */
+        /*
+         * move entries
+         */
+        ns = nd = 0;
+        for (di = 0; si < sp->header.nextindex; si++, di++) {
+                ssi = sstbl[si];
+                dstbl[di] = dsi;
+                /* is next slot contiguous ? (a gap in the source slots
+                 * closes the open vector and starts another) */
+                if (ssi != xssi + 1) {
+                        /* close current linelock */
+                        slv->length = ns;
+                        sdtlck->index++;
+                        /* open new linelock */
+                        if (sdtlck->index < sdtlck->maxcnt)
+                                slv++;
+                        else {
+                                sdtlck = (struct dt_lock *) txLinelock(sdtlck);
+                                slv = & sdtlck->lv[0];
+                        }
+                        slv->offset = ssi;
+                        ns = 0;
+                }
+                /*
+                 * move head/only segment of an entry
+                 */
+                /* get dst slot */
+                h = d = &dp->slot[dsi];
+                /* get src slot and move */
+                s = &sp->slot[ssi];
+                if (sp->header.flag & BT_LEAF) {
+                        /* get source entry */
+                        slh = (struct ldtentry *) s;
+                        dlh = (struct ldtentry *) h;
+                        snamlen = slh->namlen;
+                        if (do_index) {
+                                len = min(snamlen, DTLHDRDATALEN);
+                                dlh->index = slh->index; /* little-endian */
+                        } else
+                                len = min(snamlen, DTLHDRDATALEN_LEGACY);
+                        memcpy(dlh, slh, 6 + len * 2);  /* 6 bytes + len 2-byte chars; presumably the 6 are the fields ahead of name - confirm against struct ldtentry */
+                        next = slh->next;
+                        /* update dst head/only segment next field */
+                        dsi++;
+                        dlh->next = dsi;
+                } else {
+                        sih = (struct idtentry *) s;
+                        snamlen = sih->namlen;
+                        len = min(snamlen, DTIHDRDATALEN);
+                        dih = (struct idtentry *) h;
+                        memcpy(dih, sih, 10 + len * 2); /* 10 = xd (8) + next (1) + namlen (1), then len 2-byte chars */
+                        next = sih->next;
+                        dsi++;
+                        dih->next = dsi;
+                }
+                /* free src head/only segment: push it on the rebuilt
+                 * source freelist */
+                s->next = sfsi;
+                s->cnt = 1;
+                sfsi = ssi;
+                ns++;
+                nd++;
+                xssi = ssi;
+                /*
+                 * move additional segment(s) of the entry
+                 */
+                snamlen -= len;
+                while ((ssi = next) >= 0) {
+                        /* is next slot contiguous ? */
+                        if (ssi != xssi + 1) {
+                                /* close current linelock */
+                                slv->length = ns;
+                                sdtlck->index++;
+                                /* open new linelock */
+                                if (sdtlck->index < sdtlck->maxcnt)
+                                        slv++;
+                                else {
+                                        sdtlck =
+                                            (struct dt_lock *)
+                                            txLinelock(sdtlck);
+                                        slv = & sdtlck->lv[0];
+                                }
+                                slv->offset = ssi;
+                                ns = 0;
+                        }
+                        /* get next source segment */
+                        s = &sp->slot[ssi];
+                        /* get next destination free slot (simply the
+                         * adjacent slot; the freelist is not consulted) */
+                        d++;
+                        len = min(snamlen, DTSLOTDATALEN);
+                        UniStrncpy_le(d->name, s->name, len);
+                        ns++;
+                        nd++;
+                        xssi = ssi;
+                        dsi++;
+                        d->next = dsi;
+                        /* free source segment (read its link before it
+                         * is overwritten with the freelist link) */
+                        next = s->next;
+                        s->next = sfsi;
+                        s->cnt = 1;
+                        sfsi = ssi;
+                        snamlen -= len;
+                }               /* end while */
+                /* terminate dst last/only segment (replaces the
+                 * provisional next = dsi written above) */
+                if (h == d) {
+                        /* single segment entry */
+                        if (dp->header.flag & BT_LEAF)
+                                dlh->next = -1;
+                        else
+                                dih->next = -1;
+                } else
+                        /* multi-segment entry */
+                        d->next = -1;
+        }                       /* end for */
+        /* close current linelock */
+        slv->length = ns;
+        sdtlck->index++;
+        *sdtlock = sdtlck;
+        dlv->length = nd;
+        ddtlck->index++;
+        *ddtlock = ddtlck;
+        /* update source header: nd slots were returned to its freelist */
+        sp->header.freelist = sfsi;
+        sp->header.freecnt += nd;
+        /* update destination header: di entries now occupy nd slots */
+        dp->header.nextindex = di;
+        dp->header.freelist = dsi;
+        dp->header.freecnt -= nd;
+}
+/*
+ *      dtDeleteEntry()
+ *
+ * function: free a (leaf/internal) entry
+ *
+ * Every slot of entry fi is put back on the page freelist and the
+ * entry is removed from the sorted entry table.
+ *
+ * log freelist header, stbl, and each segment slot of entry
+ * (even though last/only segment next field is modified,
+ * physical image logging requires all segment slots of
+ * the entry logged to avoid applying previous updates
+ * to the same slots)
+ *
+ * *dtlock is updated because txLinelock() may hand back a new lock.
+ */
+static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
+{
+        struct dt_lock *lock = *dtlock;
+        s8 *stbl = DT_GETSTBL(p);
+        int head = stbl[fi];    /* head slot of the entry being freed */
+        struct dtslot *seg;
+        struct lv *vec;
+        int cur, prev;          /* slot being visited / slot visited before it */
+        int run;                /* slots covered by the open line vector */
+        int freed;              /* slots released so far */
+        int count;
+        /* start a line vector at the head slot */
+        if (lock->index >= lock->maxcnt)
+                lock = (struct dt_lock *) txLinelock(lock);
+        vec = &lock->lv[lock->index];
+        vec->offset = head;
+        /* read the chain link through the typed view of the head first,
+         * then rewrite the head as a plain free slot */
+        seg = &p->slot[head];
+        if (p->header.flag & BT_LEAF)
+                cur = ((struct ldtentry *) seg)->next;
+        else
+                cur = ((struct idtentry *) seg)->next;
+        seg->next = cur;
+        seg->cnt = 1;
+        run = 1;
+        freed = 1;
+        prev = head;
+        /* walk the remaining segments of the entry */
+        for (; cur >= 0; cur = seg->next) {
+                if (cur != prev + 1) {
+                        /* not adjacent: finish this vector, begin another */
+                        vec->length = run;
+                        lock->index++;
+                        if (lock->index >= lock->maxcnt) {
+                                lock = (struct dt_lock *) txLinelock(lock);
+                                vec = &lock->lv[0];
+                        } else
+                                vec++;
+                        vec->offset = cur;
+                        run = 0;
+                }
+                run++;
+                freed++;
+                prev = cur;
+                seg = &p->slot[cur];
+                seg->cnt = 1;
+        }
+        /* finish the last vector and hand the lock back */
+        vec->length = run;
+        lock->index++;
+        *dtlock = lock;
+        /* splice the entry's chain onto the front of the freelist */
+        seg->next = p->header.freelist;
+        p->header.freelist = head;
+        p->header.freecnt += freed;
+        /* if the entry was not the last one, shift the succeeding
+         * stbl entries left over it
+         */
+        count = p->header.nextindex;
+        if (fi < count - 1)
+                memmove(&stbl[fi], &stbl[fi + 1], count - fi - 1);
+        p->header.nextindex--;
+}
+/*
+ *      dtTruncateEntry()
+ *
+ * function: truncate a (leaf/internal) entry
+ *
+ * The head segment of entry ti is kept, with its name length set to 0
+ * and its chain ended; any additional segments go back on the page
+ * freelist.  The page is asserted to be an internal page.
+ *
+ * log freelist header, stbl, and each segment slot of entry
+ * (even though last/only segment next field is modified,
+ * physical image logging requires all segment slots of
+ * the entry logged to avoid applying previous updates
+ * to the same slots)
+ */
+static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
+{
+        struct dt_lock *dtlck = *dtlock;
+        s8 *stbl = DT_GETSTBL(p);
+        int head = stbl[ti];    /* head slot of the entry being truncated */
+        struct idtentry *router;
+        struct dtslot *seg;
+        struct lv *vec;
+        int first;              /* first additional segment, if any */
+        int cur, prev;
+        int run;                /* slots covered by the open line vector */
+        int freed;              /* additional segments released */
+        /* start a line vector at the head slot */
+        if (dtlck->index >= dtlck->maxcnt)
+                dtlck = (struct dt_lock *) txLinelock(dtlck);
+        vec = &dtlck->lv[dtlck->index];
+        vec->offset = head;
+        seg = &p->slot[head];
+        ASSERT(p->header.flag & BT_INTERNAL);
+        /* empty the name and detach the rest of the chain */
+        router = (struct idtentry *) seg;
+        router->namlen = 0;
+        first = router->next;
+        router->next = -1;
+        run = 1;
+        freed = 0;
+        prev = head;
+        /* walk the detached segments */
+        for (cur = first; cur >= 0; cur = seg->next) {
+                if (cur != prev + 1) {
+                        /* not adjacent: finish this vector, begin another */
+                        vec->length = run;
+                        dtlck->index++;
+                        if (dtlck->index >= dtlck->maxcnt) {
+                                dtlck = (struct dt_lock *) txLinelock(dtlck);
+                                vec = &dtlck->lv[0];
+                        } else
+                                vec++;
+                        vec->offset = cur;
+                        run = 0;
+                }
+                run++;
+                freed++;
+                prev = cur;
+                seg = &p->slot[cur];
+                seg->cnt = 1;
+        }
+        /* finish the last vector and hand the lock back */
+        vec->length = run;
+        dtlck->index++;
+        *dtlock = dtlck;
+        /* only touch the freelist when something was detached */
+        if (freed) {
+                seg->next = p->header.freelist;
+                p->header.freelist = first;
+                p->header.freecnt += freed;
+        }
+}
+/*
+ *      dtLinelockFreelist()
+ *
+ * function: record line vectors for the slots on the page freelist
+ *
+ * Starting at p->header.freelist, the chain is followed while the next
+ * slot index is non-negative and below m; each run of adjacent slots
+ * becomes one line vector.  *dtlock is updated on return.
+ */
+static void dtLinelockFreelist(dtpage_t * p,    /* directory page */
+                               int m,   /* max slot index */
+                               struct dt_lock ** dtlock)
+{
+        struct dt_lock *lock = *dtlock;
+        int first = p->header.freelist; /* head of the freelist */
+        struct lv *vec;
+        int cur, prev;
+        int run;                /* slots covered by the open line vector */
+        /* start a line vector at the first free slot */
+        if (lock->index >= lock->maxcnt)
+                lock = (struct dt_lock *) txLinelock(lock);
+        vec = &lock->lv[lock->index];
+        vec->offset = first;
+        run = 1;
+        prev = first;
+        cur = p->slot[first].next;
+        /* follow the chain while it stays inside [0, m) */
+        while (cur >= 0 && cur < m) {
+                if (cur != prev + 1) {
+                        /* not adjacent: finish this vector, begin another */
+                        vec->length = run;
+                        lock->index++;
+                        if (lock->index >= lock->maxcnt) {
+                                lock = (struct dt_lock *) txLinelock(lock);
+                                vec = &lock->lv[0];
+                        } else
+                                vec++;
+                        vec->offset = cur;
+                        run = 0;
+                }
+                run++;
+                prev = cur;
+                cur = p->slot[cur].next;
+        }
+        /* finish the last vector and hand the lock back */
+        vec->length = run;
+        lock->index++;
+        *dtlock = lock;
+}
+/*
+ * NAME: dtModify
+ *
+ * FUNCTION: Replace the inode number stored in a directory entry
+ *
+ * PARAMETERS:
+ *      tid      - Transaction id
+ *      ip       - Inode of parent directory
+ *      key      - Name of the entry to change
+ *      orig_ino - Inode number the entry is expected to hold
+ *                 (passed through to dtSearch())
+ *      new_ino  - Inode number to store in the entry
+ *      flag     - JFS_RENAME
+ *
+ * RETURNS:
+ *      0        - Entry was modified
+ *      non-zero - Whatever dtSearch() returned; per the interface notes
+ *                 for this routine that is -ENOENT when no entry matches
+ *                 key and -ESTALE when the entry found does not match
+ *                 orig_ino
+ */
+int dtModify(tid_t tid, struct inode *ip,
+         struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
+{
+        struct btstack btstack;
+        struct metapage *mp;
+        struct tlock *tlck;
+        struct dt_lock *dtlck;
+        struct ldtentry *target;
+        dtpage_t *page;
+        s8 *stbl;
+        s64 blkno;
+        int pos;                /* stbl index reported by the search */
+        int slotno;             /* head slot of the entry */
+        int err;
+        /*
+         * locate the entry; on success dtSearch() leaves the leaf page
+         * pinned and the position on top of btstack
+         */
+        err = dtSearch(ip, key, orig_ino, &btstack, flag);
+        if (err)
+                return err;
+        DT_GETSEARCH(ip, btstack.top, blkno, mp, page, pos);
+        BT_MARK_DIRTY(mp, ip);
+        /* transaction-lock the leaf page that holds the entry */
+        tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+        dtlck = (struct dt_lock *) &tlck->lock;
+        stbl = DT_GETSTBL(page);
+        slotno = stbl[pos];
+        /* a single one-slot line vector covers the head segment */
+        ASSERT(dtlck->index == 0);
+        dtlck->lv[0].offset = slotno;
+        dtlck->lv[0].length = 1;
+        dtlck->index++;
+        /* overwrite the inode number held in the head segment */
+        target = (struct ldtentry *) &page->slot[slotno];
+        target->inumber = cpu_to_le32(new_ino);
+        /* unpin the leaf page */
+        DT_PUTPAGE(mp);
+        return 0;
+}
+#ifdef _JFS_DEBUG_DTREE
+/*
+ *      dtDisplayTree()
+ *
+ * function: traverse forward
+ *
+ * Debug helper (built only under _JFS_DEBUG_DTREE).  Starting at block 0
+ * (the root, which resides in the inode), every page is printed with
+ * dtDisplayPage(); internal pages are descended entry by entry using
+ * btstack as an explicit stack of parent positions.
+ *
+ * return: 0 once the walk pops past the root;
+ *         the DT_GETPAGE() error code if a page cannot be read
+ */
+int dtDisplayTree(struct inode *ip)
+{
+        int rc;
+        struct metapage *mp;
+        dtpage_t *p;
+        s64 bn, pbn;
+        int index, lastindex, v, h;
+        pxd_t *xd;
+        struct btstack btstack;
+        struct btframe *btsp;
+        struct btframe *parent;
+        u8 *stbl;
+        int psize = 256;
+        printk("display B+-tree.\n");
+        /* clear stack */
+        btsp = btstack.stack;
+        /*
+         * start with root
+         *
+         * root resides in the inode
+         */
+        bn = 0;
+        v = h = 0;
+        /*
+         * first access of each page:
+         */
+      newPage:
+        DT_GETPAGE(ip, bn, mp, psize, p, rc);
+        if (rc)
+                return rc;
+        /* process entries forward from first index */
+        index = 0;
+        lastindex = p->header.nextindex - 1;
+        if (p->header.flag & BT_INTERNAL) {
+                /*
+                 * first access of each internal page
+                 */
+                printk("internal page ");
+                dtDisplayPage(ip, bn, p);
+                goto getChild;
+        } else {                /* (p->header.flag & BT_LEAF) */
+                /*
+                 * first access of each leaf page
+                 */
+                printk("leaf page ");
+                dtDisplayPage(ip, bn, p);
+                /*
+                 * process leaf page entries
+                 *
+                 for ( ; index <= lastindex; index++)
+                 {
+                 }
+                 */
+                /* unpin the leaf page */
+                DT_PUTPAGE(mp);
+        }
+        /*
+         * go back up to the parent page
+         */
+      getParent:
+        /* pop/restore parent entry for the current child page */
+        if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL)
+                /* current page must have been root: traversal finished.
+                 * The function is declared int, so report success. */
+                return 0;
+        /*
+         * parent page scan completed
+         */
+        if ((index = parent->index) == (lastindex = parent->lastindex)) {
+                /* go back up to the parent page */
+                goto getParent;
+        }
+        /*
+         * parent page has entries remaining
+         */
+        /* get back the parent page (it was released before descending) */
+        bn = parent->bn;
+        /* v = parent->level; */
+        DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /* get next parent entry */
+        index++;
+        /*
+         * internal page: go down to child page of current entry
+         */
+      getChild:
+        /* push/save current parent entry for the child page */
+        btsp->bn = pbn = bn;
+        btsp->index = index;
+        btsp->lastindex = lastindex;
+        /* btsp->level = v; */
+        /* btsp->node = h; */
+        ++btsp;
+        /* get current entry for the child page */
+        stbl = DT_GETSTBL(p);
+        xd = (pxd_t *) & p->slot[stbl[index]];
+        /*
+         * first access of each internal entry:
+         */
+        /* get child page */
+        bn = addressPXD(xd);
+        psize = lengthPXD(xd) << ip->i_ipmnt->i_l2bsize;
+        printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn);
+        v++;
+        h = index;
+        /* release parent page */
+        DT_PUTPAGE(mp);
+        /* process the child page */
+        goto newPage;
+}
+/*
+ *      dtDisplayPage()
+ *
+ * function: display page
+ *
+ * Debug helper (built only under _JFS_DEBUG_DTREE).  Prints the page
+ * header and then each entry, four to a line: name and inode number for
+ * a leaf page, name and child block address for an internal page.
+ *
+ * parameters:
+ *      ip - directory inode
+ *      bn - block number of the page (printed, and used to read the page
+ *           when p is NULL)
+ *      p  - the page, or NULL to have it read here and released on exit
+ *
+ * return: 0, or the DT_GETPAGE() error code when p is NULL and the page
+ *         cannot be read
+ */
+int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p)
+{
+        int rc;
+        struct metapage *mp;
+        struct ldtentry *lh;
+        struct idtentry *ih;
+        pxd_t *xd;
+        int i, j;
+        u8 *stbl;
+        wchar_t name[JFS_NAME_MAX + 1];
+        struct component_name key = { 0, name };
+        int freepage = 0;
+        if (p == NULL) {
+                freepage = 1;
+                DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+        }
+        /* display page control */
+        printk("bn:0x%Lx flag:0x%08x nextindex:%d\n",
+               bn, p->header.flag, p->header.nextindex);
+        /* display entries; j counts entries on the current output line */
+        stbl = DT_GETSTBL(p);
+        for (i = 0, j = 1; i < p->header.nextindex; i++, j++) {
+                dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag);
+                key.name[key.namlen] = '\0';
+                /* NOTE(review): key.name is a wchar_t string but is printed
+                 * with %s below - confirm the intended output format */
+                if (p->header.flag & BT_LEAF) {
+                        lh = (struct ldtentry *) & p->slot[stbl[i]];
+                        printk("\t[%d] %s:%d", i, key.name,
+                               le32_to_cpu(lh->inumber));
+                } else {
+                        ih = (struct idtentry *) & p->slot[stbl[i]];
+                        xd = (pxd_t *) ih;
+                        bn = addressPXD(xd);
+                        printk("\t[%d] %s:0x%Lx", i, key.name, bn);
+                }
+                if (j == 4) {
+                        printk("\n");
+                        j = 0;
+                }
+        }
+        printk("\n");
+        if (freepage)
+                DT_PUTPAGE(mp);
+        return 0;
+}
+#endif                          /* _JFS_DEBUG_DTREE */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
new file mode 100644
index 000000000000..273a80130c9d
--- /dev/null
+++ b/fs/jfs/jfs_dtree.h
@@ -0,0 +1,279 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_DTREE
+#define _H_JFS_DTREE
+/*
+ *      jfs_dtree.h: directory B+-tree manager
+ */
+#include "jfs_btree.h"
+/*
+ * data associated with a directory entry: either the leaf form
+ * (tid/ip/ino) or an extent descriptor (xd).
+ * NOTE(review): presumably xd is the form used for internal (router)
+ * entries - confirm against the users in jfs_dtree.c.
+ */
+typedef union {
+        struct {
+                tid_t tid;
+                struct inode *ip;
+                u32 ino;
+        } leaf;
+        pxd_t xd;
+} ddata_t;
+/*
+ *      entry segment/slot
+ *
+ * an entry consists of type dependent head/only segment/slot and
+ * additional segments/slots linked via next field;
+ * N.B. last/only segment of entry is terminated by next = -1;
+ */
+/*
+ *      directory page slot
+ *
+ * generic 32-byte slot: a 2-byte header (next, cnt) followed by room for
+ * 15 16-bit name characters.
+ */
+struct dtslot {
+        s8 next;                /* 1: next slot of the chain */
+        s8 cnt;                 /* 1: NOTE(review): count - meaning not visible here */
+        __le16 name[15];        /* 30: name characters held by this slot */
+};                              /* (32) */
+/* NOTE(review): DATASLOTSIZE/L2DATASLOTSIZE (16 and its log2) have no user in view */
+#define DATASLOTSIZE    16
+#define L2DATASLOTSIZE  4
+/* size of a struct dtslot in bytes, and its log2 */
+#define DTSLOTSIZE      32
+#define L2DTSLOTSIZE    5
+/* bytes taken by next + cnt */
+#define DTSLOTHDRSIZE   2
+/* name area of a slot: 30 bytes, i.e. 15 16-bit characters */
+#define DTSLOTDATASIZE  30
+#define DTSLOTDATALEN   15
+/*
+ *       internal node entry head/only segment
+ *
+ * first (or only) 32-byte slot of an internal-page entry: the child
+ * extent descriptor, the chain link, the key length and the first 11
+ * characters of the key.
+ */
+struct idtentry {
+        pxd_t xd;               /* 8: child extent descriptor */
+        s8 next;                /* 1: next slot of the entry */
+        u8 namlen;              /* 1: key length */
+        __le16 name[11];        /* 22: 2-byte aligned */
+};                              /* (32) */
+/* bytes ahead of name[]: xd (8) + next (1) + namlen (1) */
+#define DTIHDRSIZE      10
+/* number of characters that fit in name[] of the head slot */
+#define DTIHDRDATALEN   11
+/*
+ * compute number of slots for entry: ceil((klen + 4) / 15).  The head slot
+ * holds 11 characters, 4 fewer than the 15 of a continuation slot, hence
+ * the + 4.
+ */
+#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
+/*
+ *      leaf node entry head/only segment
+ *
+ *      For legacy filesystems, name contains 13 wchars -- no index field
+ *      (the 4 bytes of index then hold two more name characters).
+ */
+struct ldtentry {
+        __le32 inumber;         /* 4: 4-byte aligned */
+        s8 next;                /* 1: next slot of the entry */
+        u8 namlen;              /* 1: name length */
+        __le16 name[11];        /* 22: 2-byte aligned */
+        __le32 index;           /* 4: index into dir_table */
+};                              /* (32) */
+/* bytes ahead of name[]: inumber (4) + next (1) + namlen (1) */
+#define DTLHDRSIZE      6
+/* characters held by the head slot: 13 without the index field, 11 with it */
+#define DTLHDRDATALEN_LEGACY    13      /* Old (OS/2) format */
+#define DTLHDRDATALEN   11
+/*
+ * dir_table used for directory traversal during readdir
+ */
+/*
+ * Keep persistent index for directory entries
+ *
+ * DO_INDEX(INODE) is non-zero when the JFS_DIR_INDEX bit is set in the
+ * mntflag of the superblock that INODE belongs to.
+ */
+#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX)
+/*
+ * Maximum entry in inline directory table
+ */
+#define MAX_INLINE_DIRTABLE_ENTRY 13
+/*
+ * one 8-byte entry of the directory index table: records the leaf page
+ * (40-bit address split over addr1/addr2) and the slot within that page
+ * of a directory entry.
+ */
+struct dir_table_slot {
+        u8 rsrvd;               /* 1: reserved */
+        u8 flag;                /* 1: 0 if free */
+        u8 slot;                /* 1: slot within leaf page of entry */
+        u8 addr1;               /* 1: upper 8 bits of leaf page address */
+        __le32 addr2;           /* 4: lower 32 bits of leaf page address -OR-
+                                   index of next entry when this entry was deleted */
+};                              /* (8) */
+/*
+ * flag values (for dir_table_slot.flag)
+ */
+#define DIR_INDEX_VALID 1
+#define DIR_INDEX_FREE 0
+/*
+ * DTSaddress: store the leaf page address address64 in a dir_table_slot;
+ * bits above 31 go to addr1 (truncated to its 8 bits), the low 32 bits go
+ * to addr2 in little-endian form.
+ *
+ * address64 is fully parenthesized before the cast so that an expression
+ * argument is converted as a whole rather than only its first operand.
+ * The macro still expands to a brace block, as before.
+ */
+#define DTSaddress(dir_table_slot, address64)\
+{\
+        (dir_table_slot)->addr1 = ((u64)(address64)) >> 32;\
+        (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+}
+/*
+ * addressDTS: rebuild the leaf page address from a dir_table_slot -
+ * addr1 supplies the bits above 31, addr2 (converted from little-endian)
+ * the low 32 bits.
+ */
+#define addressDTS(dts)\
+        ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
+/*
+ * compute number of slots for entry
+ *
+ * legacy leaf entries: ceil((klen + 2) / 15) - the legacy head slot holds
+ * 13 characters, 2 fewer than a 15-character continuation slot.
+ * indexed leaf entries hold 11 characters in the head slot, the same as
+ * internal entries, so NDTLEAF is NDTINTERNAL.
+ */
+#define NDTLEAF_LEGACY(klen)    ( ((2 + (klen)) + (15 - 1)) / 15 )
+#define NDTLEAF NDTINTERNAL
+/*
+ *      directory root page (in-line in on-disk inode):
+ *
+ * cf. dtpage_t below.
+ *
+ * nine 32-byte slots; the header overlays slot[0].
+ */
+typedef union {
+        struct {
+                struct dasd DASD; /* 16: DASD limit/usage info */
+                u8 flag;        /* 1: page flags */
+                u8 nextindex;   /* 1: next free entry in stbl */
+                s8 freecnt;     /* 1: free count */
+                s8 freelist;    /* 1: freelist header */
+                __le32 idotdot; /* 4: parent inode number */
+                s8 stbl[8];     /* 8: sorted entry index table */
+        } header;               /* (32) */
+        struct dtslot slot[9];
+} dtroot_t;
+/* parent inode number stored in the in-inode directory root of IP */
+#define PARENT(IP) \
+        (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot))
+/* number of slots in dtroot_t.slot[] */
+#define DTROOTMAXSLOT   9
+/* true when the root's sorted entry table holds no entry (nextindex == 0) */
+#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0)
+/*
+ *      directory regular page:
+ *
+ *      entry slot array of 32 byte slot
+ *
+ * sorted entry slot index table (stbl):
+ * contiguous slots at slot specified by stblindex,
+ * 1-byte per entry
+ *   512 byte block:  16 entry tbl (1 slot)
+ *  1024 byte block:  32 entry tbl (1 slot)
+ *  2048 byte block:  64 entry tbl (2 slot)
+ *  4096 byte block: 128 entry tbl (4 slot)
+ *
+ * data area:
+ *   512 byte block:  16 - 2 =  14 slot
+ *  1024 byte block:  32 - 2 =  30 slot
+ *  2048 byte block:  64 - 3 =  61 slot
+ *  4096 byte block: 128 - 5 = 123 slot
+ *
+ * N.B. index is 0-based; index fields refer to slot index
+ * except nextindex which refers to entry index in stbl;
+ * end of entry slot list or freelist is marked with -1.
+ */
+/*
+ * directory regular page: up to 128 32-byte slots; the header overlays
+ * slot[0].
+ */
+typedef union {
+        struct {
+                __le64 next;    /* 8: next sibling */
+                __le64 prev;    /* 8: previous sibling */
+                u8 flag;        /* 1: page flags */
+                u8 nextindex;   /* 1: next entry index in stbl */
+                s8 freecnt;     /* 1: free count */
+                s8 freelist;    /* 1: slot index of head of freelist */
+                u8 maxslot;     /* 1: number of slots in page slot[] */
+                u8 stblindex;   /* 1: slot index of start of stbl */
+                u8 rsrvd[2];    /* 2: reserved */
+                pxd_t self;     /* 8: self pxd */
+        } header;               /* (32) */
+        struct dtslot slot[128];
+} dtpage_t;
+/* number of slots in dtpage_t.slot[] */
+#define DTPAGEMAXSLOT        128
+/*
+ * per page size: bytes in the page, slots taken by the sorted entry table
+ * (TSLOTS) and total slots (SLOTS = bytes / 32)
+ */
+#define DT8THPGNODEBYTES     512
+#define DT8THPGNODETSLOTS      1
+#define DT8THPGNODESLOTS      16
+#define DTQTRPGNODEBYTES    1024
+#define DTQTRPGNODETSLOTS      1
+#define DTQTRPGNODESLOTS      32
+#define DTHALFPGNODEBYTES   2048
+#define DTHALFPGNODETSLOTS     2
+#define DTHALFPGNODESLOTS     64
+#define DTFULLPGNODEBYTES   4096
+#define DTFULLPGNODETSLOTS     4
+#define DTFULLPGNODESLOTS    128
+/* NOTE(review): presumably the first slot after the header slot - confirm */
+#define DTENTRYSTART    1
+/*
+ * get sorted entry table of the page
+ *
+ * for a page flagged BT_ROOT the table is header.stbl of the dtroot_t
+ * overlay; for any other page it starts at slot[header.stblindex].
+ */
+#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\
+        ((dtroot_t *)(p))->header.stbl : \
+        (s8 *)&(p)->slot[(p)->header.stblindex] )
+/*
+ * Flags for dtSearch
+ */
+#define JFS_CREATE 1
+#define JFS_LOOKUP 2
+#define JFS_REMOVE 3
+#define JFS_RENAME 4
+/*
+ * size of a struct dirent trimmed to a name of namlen characters:
+ * the full-size name area of 2*(JFS_NAME_MAX+1) bytes is replaced by
+ * 2*(namlen+1) bytes and the result is rounded up to a multiple of 4.
+ */
+#define DIRENTSIZ(namlen) \
+    ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
+/*
+ * Maximum file offset for directories.
+ */
+#define DIREND  INT_MAX
+/*
+ *      external declarations
+ *
+ * entry points of the directory B+-tree manager; the definitions are not
+ * in this header.
+ */
+extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot);
+extern int dtSearch(struct inode *ip, struct component_name * key,
+                    ino_t * data, struct btstack * btstack, int flag);
+extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key,
+                    ino_t * ino, struct btstack * btstack);
+extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
+                    ino_t * data, int flag);
+extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
+                    ino_t * orig_ino, ino_t new_ino, int flag);
+extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+/* tree/page dump helpers, declared only when _JFS_DEBUG_DTREE is defined */
+#ifdef  _JFS_DEBUG_DTREE
+extern int dtDisplayTree(struct inode *ip);
+extern int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p);
+#endif                          /* _JFS_DEBUG_DTREE */
+#endif                          /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
new file mode 100644
index 000000000000..1953acb79266
--- /dev/null
+++ b/fs/jfs/jfs_extent.c
@@ -0,0 +1,668 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_extent.h"
+#include "jfs_debug.h"
+/*
+ * forward references
+ *
+ * file-local helpers defined further down; extBrealloc() exists only
+ * when _NOTYET is defined.
+ */
+static int extBalloc(struct inode *, s64, s64 *, s64 *);
+#ifdef _NOTYET
+static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
+#endif
+static s64 extRoundDown(s64 nb);
+/*
+ * external references
+ *
+ * jfs_commit_inode() is called by extAlloc() once the commit semaphore
+ * has been released.
+ */
+extern int jfs_commit_inode(struct inode *, int);
+#define DPD(a)          (printk("(a): %d\n",(a)))
+#define DPC(a)          (printk("(a): %c\n",(a)))
+#define DPL1(a)                                 \
+{                                               \
+        if ((a) >> 32)                          \
+                printk("(a): %x%08x  ",(a));    \
+        else                                    \
+                printk("(a): %x  ",(a) << 32);  \
+}
+#define DPL(a)                                  \
+{                                               \
+        if ((a) >> 32)                          \
+                printk("(a): %x%08x\n",(a));    \
+        else                                    \
+                printk("(a): %x\n",(a) << 32);  \
+}
+#define DPD1(a)         (printk("(a): %d  ",(a)))
+#define DPX(a)          (printk("(a): %08x\n",(a)))
+#define DPX1(a)         (printk("(a): %08x  ",(a)))
+#define DPS(a)          (printk("%s\n",(a)))
+#define DPE(a)          (printk("\nENTERING: %s\n",(a)))
+#define DPE1(a)          (printk("\nENTERING: %s",(a)))
+#define DPS1(a)         (printk("  %s  ",(a)))
+/*
+ * NAME:        extAlloc()
+ *
+ * FUNCTION:    allocate disk blocks for a page range of a file and
+ *              record them in the file's extent tree, either by growing
+ *              the hint extent or by inserting a new extent.
+ *
+ * PARAMETERS:
+ *      ip      - the inode of the file.
+ *      xlen    - requested extent length; clamped to MAXXLEN.
+ *      pno     - the starting page number within the file.
+ *      xp      - pointer to an xad.  on entry, a non-zero xaddr makes
+ *                the xad an allocation hint.  on successful exit, the
+ *                xad describes the newly allocated extent.
+ *      abnr    - boolean_t; TRUE marks the new extent as allocated but
+ *                not recorded.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EDQUOT - the quota charge for the new blocks was refused.
+ */
+int
+extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        s64 got_len, got_addr, xoff, hint, extend_addr = 0;
+        int rc;
+        int xflag;
+        /* may block while transaction resources are scarce */
+        txBeginAnon(ip->i_sb);
+        /* keep jfs_commit_inode() out while the extent is set up */
+        down(&JFS_IP(ip)->commit_sem);
+        /* clamp the request to the largest extent length */
+        if (xlen > MAXXLEN)
+                xlen = MAXXLEN;
+        /* extent offset of the first page of the request */
+        xoff = pno << sbi->l2nbperpage;
+        /* was an allocation hint passed in? */
+        hint = addressXAD(xp);
+        if (hint) {
+                /* length of the extent the hint describes */
+                got_len = lengthXAD(xp);
+                /* when the hint extent ends exactly where this request
+                 * starts and carries the same abnr state, remember the
+                 * block right behind it: getting that block back from
+                 * the allocator means the hint extent can simply grow.
+                 */
+                if (offsetXAD(xp) + got_len == xoff &&
+                    abnr == ((xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE))
+                        extend_addr = hint + got_len;
+                /* point the hint at the last block of its extent */
+                hint += (got_len - 1);
+        }
+        /* get the blocks.  extBalloc() starts with the full request and,
+         * on lack of contiguous space, retries with smaller power-of-2
+         * sizes until the size drops below one page worth of blocks;
+         * got_len comes back as the number actually allocated.
+         */
+        got_len = xlen;
+        rc = extBalloc(ip, hint ? hint : INOHINT(ip), &got_len, &got_addr);
+        if (rc)
+                goto out_unlock;
+        /* charge the blocks to quota; hand them back if refused */
+        if (DQUOT_ALLOC_BLOCK(ip, got_len)) {
+                dbFree(ip, got_addr, (s64) got_len);
+                rc = -EDQUOT;
+                goto out_unlock;
+        }
+        /* extent flag for the new extent */
+        if (abnr == TRUE)
+                xflag = XAD_NOTRECORDED;
+        else
+                xflag = 0;
+        /* grow the hint extent when the new blocks directly follow it,
+         * otherwise insert a separate extent.
+         */
+        if (extend_addr && extend_addr == got_addr)
+                rc = xtExtend(0, ip, xoff, (int) got_len, 0);
+        else
+                rc = xtInsert(0, ip, xflag, xoff, (int) got_len, &got_addr, 0);
+        /* tree update failed: undo the allocation and the quota charge */
+        if (rc) {
+                dbFree(ip, got_addr, got_len);
+                DQUOT_FREE_BLOCK(ip, got_len);
+                goto out_unlock;
+        }
+        /* report the new extent through xp */
+        XADaddress(xp, got_addr);
+        XADlength(xp, got_len);
+        XADoffset(xp, xoff);
+        xp->flag = xflag;
+        mark_inode_dirty(ip);
+        up(&JFS_IP(ip)->commit_sem);
+        /*
+         * COMMIT_Synclist flags an anonymous tlock on a page that is on
+         * the sync list.
+         * We need to commit the inode to get the page written to disk.
+         */
+        if (test_and_clear_cflag(COMMIT_Synclist,ip))
+                jfs_commit_inode(ip, 0);
+        return (0);
+out_unlock:
+        up(&JFS_IP(ip)->commit_sem);
+        return (rc);
+}
+#ifdef _NOTYET
+/*
+ * NAME:        extRealloc()
+ *
+ * FUNCTION:    extend the allocation of a file extent containing a
+ *              partial back last page.  (compiled only when _NOTYET is
+ *              defined.)
+ *
+ * PARAMETERS:
+ *      ip      - the inode of the file.
+ *      nxlen   - requested size of the resulting extent; clamped to
+ *                MAXXLEN.
+ *      xp      - pointer to an xad. on entry, the xad describes the
+ *                extent to be extended; on successful exit, the xad
+ *                describes the newly allocated extent.
+ *      abnr    - boolean_t indicating whether the newly allocated extent
+ *                should be marked as allocated but not recorded.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EDQUOT - the quota charge for the blocks was refused.
+ */
+int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
+{
+        struct super_block *sb = ip->i_sb;
+        s64 xaddr, xlen, nxaddr, delta, xoff;
+        s64 ntail, nextend, ninsert;
+        int rc, nbperpage = JFS_SBI(sb)->nbperpage;
+        int xflag;
+        /* This blocks if we are low on resources */
+        txBeginAnon(ip->i_sb);
+        /* held until the exit label (or the early -EDQUOT return) */
+        down(&JFS_IP(ip)->commit_sem);
+        /* validate extent length */
+        if (nxlen > MAXXLEN)
+                nxlen = MAXXLEN;
+        /* get the extend (partial) page's disk block address and
+         * number of blocks.
+         */
+        xaddr = addressXAD(xp);
+        xlen = lengthXAD(xp);
+        xoff = offsetXAD(xp);
+        /* if the extend page is abnr and if the request is for
+         * the extent to be allocated and recorded, 
+         * make the page allocated and recorded.
+         */
+        if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
+                xp->flag = 0;
+                if ((rc = xtUpdate(0, ip, xp)))
+                        goto exit;
+        }
+        /* try to allocate the requested number of blocks for the
+         * extent.  extBrealloc() first tries to satisfy the request
+         * by extending the allocation in place. otherwise, it will
+         * try to allocate a new set of blocks large enough for the
+         * request.  in satisfying a request, it may allocate
+         * less than what was requested but will always allocate enough
+         * space as to satisfy the extend page.
+         */
+        if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
+                goto exit;
+        /* Allocate blocks to quota.
+         * NOTE(review): the charge and the dbFree() below both use the
+         * full nxlen from nxaddr, although after an in-place extension
+         * only delta blocks are new - confirm this is intended.
+         */
+        if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
+                dbFree(ip, nxaddr, (s64) nxlen);
+                up(&JFS_IP(ip)->commit_sem);
+                return -EDQUOT;
+        }
+        /* number of blocks gained over the old extent */
+        delta = nxlen - xlen;
+        /* check if the extend page is not abnr but the request is abnr
+         * and the allocated disk space is for more than one page.  if this
+         * is the case, there is a mismatch of abnr between the extend page
+         * and the one or more pages following the extend page.  as a result,
+         * two extents will have to be manipulated. the first will be that
+         * of the extent of the extend page and will be manipulated thru
+         * an xtExtend() or an xtTailgate(), depending upon whether the
+         * disk allocation occurred as an inplace extension.  the second
+         * extent will be manipulated (created) through an xtInsert() and
+         * will be for the pages following the extend page.
+         */
+        if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
+                /* first extent covers one page, the rest is inserted */
+                ntail = nbperpage;
+                nextend = ntail - xlen;
+                ninsert = nxlen - nbperpage;
+                xflag = XAD_NOTRECORDED;
+        } else {
+                /* a single extent takes all of the new space */
+                ntail = nxlen;
+                nextend = delta;
+                ninsert = 0;
+                xflag = xp->flag;
+        }
+        /* if we were able to extend the disk allocation in place,
+         * extend the extent.  otherwise, move the extent to a
+         * new disk location.
+         */
+        if (xaddr == nxaddr) {
+                /* extend the extent; on failure give back only the added blocks */
+                if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
+                        dbFree(ip, xaddr + xlen, delta);
+                        DQUOT_FREE_BLOCK(ip, nxlen);
+                        goto exit;
+                }
+        } else {
+                /*
+                 * move the extent to a new location:
+                 *
+                 * xtTailgate() accounts for relocated tail extent;
+                 */
+                if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
+                        dbFree(ip, nxaddr, nxlen);
+                        DQUOT_FREE_BLOCK(ip, nxlen);
+                        goto exit;
+                }
+        }
+        /* check if we need to also insert a new extent */
+        if (ninsert) {
+                /* perform the insert.  if it fails, free the blocks
+                 * to be inserted and make it appear that we only did
+                 * the xtExtend() or xtTailgate() above.
+                 * NOTE(review): no quota is released for the freed
+                 * blocks on this path, and the xtInsert() error is
+                 * not propagated (rc stays 0) - confirm.
+                 */
+                xaddr = nxaddr + ntail;
+                if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
+                              &xaddr, 0)) {
+                        dbFree(ip, xaddr, (s64) ninsert);
+                        delta = nextend;
+                        nxlen = ntail;
+                        xflag = 0;
+                }
+        }
+        /* set the return results */
+        XADaddress(xp, nxaddr);
+        XADlength(xp, nxlen);
+        XADoffset(xp, xoff);
+        xp->flag = xflag;
+        mark_inode_dirty(ip);
+exit:
+        /* common exit: release the commit semaphore taken above */
+        up(&JFS_IP(ip)->commit_sem);
+        return (rc);
+}
+#endif                  /* _NOTYET */
+/*
+ * NAME:        extHint()
+ *
+ * FUNCTION:    build an extent allocation hint for a file offset from
+ *              the extent that backs the page just before that offset.
+ *
+ * PARAMETERS:
+ *      ip      - the inode of the file.
+ *      offset  - file offset for which the hint is needed.
+ *      xp      - pointer to the xad that receives the hint; its address
+ *                is left at 0 when no hint is available.
+ *
+ * RETURN VALUES:
+ *      0       - success (with or without a hint)
+ *      -EIO    - i/o error, or the lookup result is inconsistent.
+ */
+int extHint(struct inode *ip, s64 offset, xad_t * xp)
+{
+        struct super_block *sb = ip->i_sb;
+        struct lxdlist wanted;
+        struct xadlist found;
+        lxd_t prev_lxd;
+        s64 prev_xoff;
+        int rc, nbperpage = JFS_SBI(sb)->nbperpage;
+        /* start out as "no hint provided" */
+        XADaddress(xp, 0);
+        /* extent offset of the page preceding the one that holds offset */
+        prev_xoff = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage;
+        /* offset lies in the first page of the file: nothing precedes it */
+        if (prev_xoff < 0)
+                return (0);
+        /* the result list has room for a single xad and lands in xp */
+        found.maxnxad = 1;
+        found.nxad = 0;
+        found.xad = xp;
+        /* the request list asks for exactly the previous page */
+        LXDoffset(&prev_lxd, prev_xoff);
+        LXDlength(&prev_lxd, nbperpage);
+        wanted.maxnlxd = 1;
+        wanted.nlxd = 1;
+        wanted.lxd = &prev_lxd;
+        /* look the previous page up in the extent tree */
+        rc = xtLookupList(ip, &wanted, &found, 0);
+        if (rc)
+                return (rc);
+        /* no extent behind the previous page, which can happen with
+         * sparse files: leave without a hint.
+         */
+        if (found.nxad == 0) {
+//              assert(ISSPARSE(ip));
+                return (0);
+        }
+        /* of the xad flags, the hint keeps only the abnr bit */
+        xp->flag &= XAD_NOTRECORDED;
+        /* a sane result is one xad spanning exactly one page */
+        if (found.nxad == 1 && lengthXAD(xp) == nbperpage)
+                return (0);
+        jfs_error(ip->i_sb, "extHint: corrupt xtree");
+        return -EIO;
+}
+/*
+ * NAME:        extRecord()
+ *
+ * FUNCTION:    change a page of a file from not recorded to recorded by
+ *              updating its extent in the extent tree.
+ *
+ * PARAMETERS:
+ *      ip      - inode of the file.
+ *      xp      - xad handed to xtUpdate() for the page.
+ *
+ * RETURN VALUES:
+ *      the value returned by xtUpdate() (0 on success).
+ */
+int extRecord(struct inode *ip, xad_t * xp)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        int err;
+        txBeginAnon(ip->i_sb);
+        /* the extent update runs under the inode's commit semaphore */
+        down(&ji->commit_sem);
+        err = xtUpdate(0, ip, xp);
+        up(&ji->commit_sem);
+        return err;
+}
+#ifdef _NOTYET
+/*
+ * NAME:        extFill()
+ *
+ * FUNCTION:    allocate disk space for a file page that represents
+ *              a file hole.  (compiled only when _NOTYET is defined.)
+ *
+ * PARAMETERS:
+ *      ip      - the inode of the file.
+ *      xp      - xad of the file page representing the hole; its offset
+ *                selects the page, and on success it describes the
+ *                extent allocated by extAlloc().
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ *      -ENOSPC - insufficient disk resources.
+ */
+int extFill(struct inode *ip, xad_t * xp)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        int rc, nbperpage = sbi->nbperpage;
+        /* extAlloc() takes a page number and turns it into an extent
+         * offset with "pno << l2nbperpage"; apply the inverse shift here
+         * (the block size itself is not a valid shift count).
+         */
+        s64 pno = offsetXAD(xp) >> sbi->l2nbperpage;
+//      assert(ISSPARSE(ip));
+        /* initialize the extent allocation hint */
+        XADaddress(xp, 0);
+        /* allocate an extent to fill the hole */
+        if ((rc = extAlloc(ip, nbperpage, pno, xp, FALSE)))
+                return (rc);
+        /* xp is an xad, so read its length with the xad accessor */
+        assert(lengthXAD(xp) == nbperpage);
+        return (0);
+}
+#endif                  /* _NOTYET */
+/*
+ * NAME:        extBalloc()
+ *
+ * FUNCTION:    allocate disk blocks to form an extent.
+ *
+ *              the first attempt is for the requested size, capped by
+ *              the largest free power-of-2 run known to the block map
+ *              (but never below one page worth of blocks).  each time
+ *              the allocator reports -ENOSPC the size is rounded down
+ *              to the next smaller power of 2 (i.e. 16 -> 8) and the
+ *              allocation is retried, until the size would fall below
+ *              the number of blocks per page.
+ *
+ * PARAMETERS:
+ *      ip       - the inode of the file.
+ *      hint     - disk block number to be used as an allocation hint.
+ *      *nblocks - pointer to an s64 value.  on entry, the desired number
+ *                 of blocks; on successful exit, the number of blocks
+ *                 actually allocated.
+ *      blkno    - pointer to a block address that is filled in on
+ *                 successful return with the starting block number of
+ *                 the newly allocated block range.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ *      -ENOSPC - insufficient disk resources.
+ */
+static int
+extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        struct bmap *bmp = sbi->bmap;
+        int nbperpage = sbi->nbperpage;
+        s64 want = *nblocks;
+        s64 largest = (s64) 1 << bmp->db_maxfreebud;
+        s64 nb, ceiling, daddr;
+        int rc, ag;
+        /* pick the size of the first attempt: the request itself, unless
+         * it is at least as large as the biggest free run in the map and
+         * bigger than a page - then start from that run, or from one
+         * page worth of blocks if the run is smaller than that.
+         */
+        if (want >= largest && want > nbperpage)
+                ceiling = (largest > nbperpage) ? largest : nbperpage;
+        else
+                ceiling = want;
+        nb = ceiling;
+        /* keep trying with shrinking sizes until an allocation succeeds */
+        for (;;) {
+                rc = dbAlloc(ip, hint, nb, &daddr);
+                if (rc == 0)
+                        break;
+                /* only "out of space" is worth a retry */
+                if (rc != -ENOSPC)
+                        return (rc);
+                /* shrink the request */
+                nb = min(ceiling, extRoundDown(nb));
+                /* less than a page cannot be covered: give up */
+                if (nb < nbperpage)
+                        return (rc);
+        }
+        *nblocks = nb;
+        *blkno = daddr;
+        /* for regular files of the FILESYSTEM_I fileset, keep the
+         * per-AG active counters of the block map in step with the
+         * allocation group this inode last allocated from.
+         */
+        if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) {
+                ag = BLKTOAG(daddr, sbi);
+                spin_lock_irq(&ji->ag_lock);
+                if (ji->active_ag == -1) {
+                        /* first allocation group for this inode */
+                        atomic_inc(&bmp->db_active[ag]);
+                        ji->active_ag = ag;
+                } else if (ji->active_ag != ag) {
+                        /* moved to another allocation group */
+                        atomic_dec(&bmp->db_active[ji->active_ag]);
+                        atomic_inc(&bmp->db_active[ag]);
+                        ji->active_ag = ag;
+                }
+                spin_unlock_irq(&ji->ag_lock);
+        }
+        return (0);
+}
+#ifdef _NOTYET
+/*
+ * NAME:        extBrealloc()
+ *
+ * FUNCTION:    attempt to extend an extent's allocation.
+ *              (compiled only when _NOTYET is defined.)
+ *
+ *              the extension is first tried in place.  if the block map
+ *              reports -ENOSPC for that, a new set of blocks is
+ *              requested through extBalloc(), which may settle for a
+ *              smaller power-of-2 size than asked for.
+ *
+ * PARAMETERS:
+ *      ip       - the inode of the file.
+ *      blkno    - starting block number of the extent's current allocation.
+ *      nblks    - number of blocks within the extent's current allocation.
+ *      newnblks - pointer to a s64 value.  on entry, the new desired
+ *                 extent size (number of blocks).  on successful exit,
+ *                 the extent's actual new size (new number of blocks).
+ *      newblkno - the starting block number of the extent's new allocation.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ *      -ENOSPC - insufficient disk resources.
+ */
+static int
+extBrealloc(struct inode *ip,
+            s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
+{
+        int rc;
+        /* first choice: grow the existing allocation where it is */
+        rc = dbExtend(ip, blkno, nblks, *newnblks - nblks);
+        if (rc == 0) {
+                *newblkno = blkno;
+                return (0);
+        }
+        /* anything but "no space" is a real failure */
+        if (rc != -ENOSPC)
+                return (rc);
+        /* no room to grow in place:
+         * look for a new set of blocks, using the old start as hint.
+         */
+        return (extBalloc(ip, blkno, newnblks, newblkno));
+}
+#endif                  /* _NOTYET */
+/*
+ * NAME:        extRoundDown()
+ *
+ * FUNCTION:    round down a specified number of blocks to the next
+ *              smallest power of 2 number.
+ *
+ * PARAMETERS:
+ *      nb      - the number of blocks to round down.
+ *
+ * RETURN VALUES:
+ *      the highest power of 2 contained in nb when nb is not itself a
+ *      power of 2 (17 -> 16); half of nb when it is (16 -> 8, 1 -> 0);
+ *      0 when nb is 0.
+ */
+static s64 extRoundDown(s64 nb)
+{
+        int i;
+        u64 m, k;
+        /* with no bit set the scan below would end with i == 64 and the
+         * shift count (63 - i) would be negative.
+         */
+        if (nb == 0)
+                return (0);
+        /* find the most significant set bit, scanning from bit 63 down */
+        for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) {
+                if (m & nb)
+                        break;
+        }
+        /* turn the scan position into the bit number */
+        i = 63 - i;
+        k = (u64) 1 << i;
+        /* bits below the top one mean nb is not a power of 2: k is the
+         * answer.  otherwise step down to the next smaller power.
+         */
+        k = ((k - 1) & nb) ? k : k >> 1;
+        return (k);
+}
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
new file mode 100644
index 000000000000..e80fc7ced87d
--- /dev/null
+++ b/fs/jfs/jfs_extent.h
@@ -0,0 +1,31 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_EXTENT
+#define _H_JFS_EXTENT
+/*
+ * get block allocation hint as location of disk inode: evaluates to the
+ * last block (address + length - 1) of the extent described by the
+ * ixpxd field of the inode's JFS_IP() data.
+ */
+#define INOHINT(ip)     \
+        (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
+extern int      extAlloc(struct inode *, s64, s64, xad_t *, boolean_t);
+extern int      extFill(struct inode *, xad_t *);
+extern int      extHint(struct inode *, s64, xad_t *);
+extern int      extRealloc(struct inode *, s64, xad_t *, boolean_t);
+extern int      extRecord(struct inode *, xad_t *);
+#endif  /* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
new file mode 100644
index 000000000000..86ccac80f0ab
--- /dev/null
+++ b/fs/jfs/jfs_filsys.h
@@ -0,0 +1,280 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2003
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_FILSYS
+#define _H_JFS_FILSYS
+/*
+ *      jfs_filsys.h
+ *
+ * file system (implementation-dependent) constants 
+ *
+ * refer to <limits.h> for system wide implementation-dependent constants 
+ */
+/*
+ *       file system option (superblock flag)
+ *
+ * every value below except JFS_COMMIT is a single bit, and no two of the
+ * single-bit values share a bit, so they can be OR-ed into one 32-bit
+ * word.  JFS_COMMIT is a multi-bit mask.
+ */
+/* mount time flag to disable journaling to disk */
+#define JFS_NOINTEGRITY 0x00000010
+/* mount time flags for error handling */
+#define JFS_ERR_REMOUNT_RO 0x00000002   /* remount read-only */
+#define JFS_ERR_CONTINUE   0x00000004   /* continue */
+#define JFS_ERR_PANIC      0x00000008   /* panic */
+/* platform option (conditional compilation) */
+#define JFS_AIX         0x80000000      /* AIX support */
+/*      POSIX name/directory  support
+ *      (NOTE(review): presumably a remark on JFS_AIX above - confirm)
+ */
+#define JFS_OS2         0x40000000      /* OS/2 support */
+/*      case-insensitive name/directory support
+ *      (NOTE(review): presumably a remark on JFS_OS2 above - confirm)
+ */
+#define JFS_DFS         0x20000000      /* DCE DFS LFS support */
+#define JFS_LINUX       0x10000000      /* Linux support */
+/*      case-sensitive name/directory support
+ *      (NOTE(review): presumably a remark on JFS_LINUX above - confirm)
+ */
+/* directory option */
+#define JFS_UNICODE     0x00000001      /* unicode name */
+/* commit option */
+#define JFS_COMMIT      0x00000f00      /* commit option mask; as written it
+                                         * spans bits 0x100 through 0x800,
+                                         * so it also covers the bit used by
+                                         * JFS_INLINELOG below
+                                         */
+#define JFS_GROUPCOMMIT 0x00000100      /* group (of 1) commit */
+#define JFS_LAZYCOMMIT  0x00000200      /* lazy commit */
+#define JFS_TMPFS       0x00000400      /* temporary file system -
+                                         * do not log/commit
+                                         */
+/* log logical volume option */
+#define JFS_INLINELOG   0x00000800      /* inline log within file system */
+#define JFS_INLINEMOVE  0x00001000      /* inline log being moved */
+/* Secondary aggregate inode table */
+#define JFS_BAD_SAIT    0x00010000      /* current secondary ait is bad */
+/* sparse regular file support */
+#define JFS_SPARSE      0x00020000      /* sparse regular file */
+/* DASD Limits          F226941 */
+#define JFS_DASD_ENABLED        0x00040000      /* DASD limits enabled */
+#define JFS_DASD_PRIME          0x00080000      /* Prime DASD usage on boot */
+/* big endian flag */
+#define JFS_SWAP_BYTES          0x00100000      /* running on big endian computer */
+/* Directory index */
+#define JFS_DIR_INDEX           0x00200000      /* Persistent index for */
+                                                /* directory entries    */
+/*
+ *      buffer cache configuration
+ */
+/* page size: any earlier definition of PSIZE is dropped first so that the
+ * value below is the one in effect.
+ */
+#ifdef PSIZE
+#undef PSIZE
+#endif
+#define PSIZE           4096    /* page size (in byte) */
+#define L2PSIZE         12      /* log2(PSIZE) */
+#define POFFSET         4095    /* offset within page: mask, PSIZE - 1 */
+/* buffer page size */
+#define BPSIZE  PSIZE
+/*
+ *      fs fundamental size
+ *
+ * PSIZE >= file system block size >= PBSIZE >= DISIZE
+ */
+#define PBSIZE          512     /* physical block size (in byte) */
+#define L2PBSIZE        9       /* log2(PBSIZE) */
+#define DISIZE          512     /* on-disk inode size (in byte) */
+#define L2DISIZE        9       /* log2(DISIZE) */
+#define IDATASIZE       256     /* inode inline data size */
+#define IXATTRSIZE      128     /* inode inline extended attribute size */
+#define XTPAGE_SIZE     4096    /* same value as PSIZE */
+#define log2_PAGESIZE     12    /* same value as L2PSIZE */
+#define IAG_SIZE        4096    /* same value as PSIZE; presumably bytes - confirm */
+#define IAG_EXTENT_SIZE 4096    /* same value as PSIZE; presumably bytes - confirm */
+#define INOSPERIAG      4096    /* number of disk inodes per iag */
+#define L2INOSPERIAG    12      /* l2 number of disk inodes per iag */
+#define INOSPEREXT      32      /* number of disk inode per extent */
+#define L2INOSPEREXT    5       /* l2 number of disk inode per extent */
+#define IXSIZE          (DISIZE * INOSPEREXT)   /* inode extent size: 512 * 32 = 16384 bytes */
+#define INOSPERPAGE     8       /* number of disk inodes per 4K page (PSIZE / DISIZE) */
+#define L2INOSPERPAGE   3       /* log2(INOSPERPAGE) */
+#define IAGFREELIST_LWM 64      /* NOTE(review): presumably a low-water mark for the iag free list - confirm */
+#define INODE_EXTENT_SIZE       IXSIZE  /* inode extent size */
+#define NUM_INODE_PER_EXTENT    INOSPEREXT      /* alias of INOSPEREXT */
+#define NUM_INODE_PER_IAG       INOSPERIAG      /* alias of INOSPERIAG */
+#define MINBLOCKSIZE            512     /* same value as PBSIZE */
+#define MAXBLOCKSIZE            4096    /* same value as PSIZE */
+#define MAXFILESIZE             ((s64)1 << 52)  /* 2^52; presumably bytes - confirm */
+#define JFS_LINK_MAX            0xffffffff      /* largest 32-bit unsigned value */
+/* Minimum number of bytes supported for a JFS partition */
+#define MINJFS                  (0x1000000)     /* 16777216 bytes = 16 megabytes */
+#define MINJFSTEXT              "16"            /* MINJFS in megabytes, as text */
+/*
+ * file system block size -> physical block size
+ *
+ * LBOFFSET/LBNUMBER split a value into its offset within a PBSIZE-byte
+ * physical block and the number of that block.  LBLK2PBLK/PBLK2LBLK
+ * shift a block number by the difference between the superblock's
+ * s_blocksize_bits and L2PBSIZE.
+ *
+ * every macro argument is parenthesized, so an expression argument
+ * (a conditional, a dereference, ...) expands as one operand.
+ */
+#define LBOFFSET(x)     ((x) & (PBSIZE - 1))
+#define LBNUMBER(x)     ((x) >> L2PBSIZE)
+#define LBLK2PBLK(sb,b) ((b) << ((sb)->s_blocksize_bits - L2PBSIZE))
+#define PBLK2LBLK(sb,b) ((b) >> ((sb)->s_blocksize_bits - L2PBSIZE))
+/* size in byte -> last page number */
+#define SIZE2PN(size)   ( ((s64)((size) - 1)) >> (L2PSIZE) )
+/* size in byte -> last file system block number */
+#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) )
+/*
+ * fixed physical block address (physical block size = 512 byte)
+ *
+ * NOTE: since we can't guarantee a physical block size of 512 bytes the use of
+ *       these macros should be removed and the byte offset macros used instead.
+ *
+ * with 512-byte blocks the values below (64, 72, 88, 120, 128) name the
+ * same positions as the byte offsets defined further down
+ * (32K, 36K, 44K, 60K, 64K).
+ */
+#define SUPER1_B        64      /* primary superblock */
+#define AIMAP_B         (SUPER1_B + 8)  /* 1st extent of aggregate inode map */
+#define AITBL_B         (AIMAP_B + 16)  /*
+                                         * 1st extent of aggregate inode table
+                                         */
+#define SUPER2_B        (AITBL_B + 32)  /* 2ndary superblock pbn */
+#define BMAP_B          (SUPER2_B + 8)  /* block allocation map */
+/*
+ * SIZE_OF_SUPER defines the total amount of space reserved on disk for the
+ * superblock.  This is not the same as the superblock structure, since all of
+ * this space is not currently being used.
+ */
+#define SIZE_OF_SUPER   PSIZE
+/*
+ * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table
+ */
+#define SIZE_OF_AG_TABLE        PSIZE
+/*
+ * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of
+ * the inode allocation map (to hold iag)
+ */
+#define SIZE_OF_MAP_PAGE        PSIZE
+/*
+ * fixed byte offset address
+ *
+ * each description below follows the macro it belongs to.
+ */
+#define SUPER1_OFF      0x8000  /* primary superblock (32K) */
+#define AIMAP_OFF       (SUPER1_OFF + SIZE_OF_SUPER)
+                                        /*
+                                         * Control page of aggregate inode map
+                                         * followed by 1st extent of map
+                                         * (32K + 4K = 36K)
+                                         */
+#define AITBL_OFF       (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1))
+                                        /*
+                                         * 1st extent of aggregate inode table
+                                         * (36K + 2 * 4K = 44K)
+                                         */
+#define SUPER2_OFF      (AITBL_OFF + INODE_EXTENT_SIZE)
+                                        /*
+                                         * secondary superblock
+                                         * (44K + 16K = 60K)
+                                         */
+#define BMAP_OFF        (SUPER2_OFF + SIZE_OF_SUPER)
+                                        /*
+                                         * block allocation map
+                                         * (60K + 4K = 64K)
+                                         */
+/*
+ * The following macro is used to indicate the number of reserved disk blocks at
+ * the front of an aggregate, in terms of physical blocks.  This value is
+ * currently defined to be 64 physical blocks of 512 bytes, i.e. 32K.  This
+ * turns out to be the same as the primary superblock's address, since it
+ * directly follows the reserved blocks.
+ */
+#define AGGR_RSVD_BLOCKS        SUPER1_B
+/*
+ * The following macro is used to indicate the number of reserved bytes at the
+ * front of an aggregate.  This value is currently defined to be 32K.  This
+ * turns out to be the same as the primary superblock's byte offset, since it
+ * directly follows the reserved blocks.
+ */
+#define AGGR_RSVD_BYTES SUPER1_OFF
+/*
+ * The following macro defines the byte offset for the first inode extent in
+ * the aggregate inode table.  This allows us to find the self inode to find the
+ * rest of the table.  Currently this value is 44K.
+ */
+#define AGGR_INODE_TABLE_START  AITBL_OFF
+/*
+ *      fixed reserved inode number
+ *
+ * two separate numbering spaces follow: inode numbers within the
+ * aggregate inode table, then inode numbers within a fileset.  the same
+ * small integers therefore appear in both lists.
+ */
+/* aggregate inode */
+#define AGGR_RESERVED_I 0       /* aggregate inode (reserved) */
+#define AGGREGATE_I     1       /* aggregate inode map inode */
+#define BMAP_I          2       /* aggregate block allocation map inode */
+#define LOG_I           3       /* aggregate inline log inode */
+#define BADBLOCK_I      4       /* aggregate bad block inode */
+#define FILESYSTEM_I    16      /* 1st/only fileset inode in ait:
+                                 * fileset inode map inode
+                                 */
+/* per fileset inode */
+#define FILESET_RSVD_I  0       /* fileset inode (reserved) */
+#define FILESET_EXT_I   1       /* fileset inode extension */
+#define ROOT_I          2       /* fileset root inode */
+#define ACL_I           3       /* fileset ACL inode */
+#define FILESET_OBJECT_I 4      /* the first fileset inode available for a file
+                                 * or directory or link...
+                                 */
+#define FIRST_FILESET_INO 16    /* the first aggregate inode which describes
+                                 * an inode.  (To fsck this is also the first
+                                 * inode in part 2 of the agg inode table.)
+                                 * same value as FILESYSTEM_I above.
+                                 */
+/*
+ *      directory configuration
+ */
+#define JFS_NAME_MAX    255
+#define JFS_PATH_MAX    BPSIZE  /* BPSIZE is PSIZE, i.e. 4096 */
+/*
+ *      file system state (superblock state)
+ *
+ * FM_CLEAN is zero; the other values are distinct single bits.
+ */
+#define FM_CLEAN 0x00000000     /* file system is unmounted and clean */
+#define FM_MOUNT 0x00000001     /* file system is mounted cleanly */
+#define FM_DIRTY 0x00000002     /* file system was not unmounted and clean
+                                 * when mounted, or a
+                                 * commit failure occurred while being mounted:
+                                 * fsck() must be run to repair
+                                 */
+#define FM_LOGREDO 0x00000004   /* log based recovery (logredo()) failed:
+                                 * fsck() must be run to repair
+                                 */
+#define FM_EXTENDFS 0x00000008  /* file system extendfs() in progress */
+#endif                          /* _H_JFS_FILSYS */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
new file mode 100644
index 000000000000..783831301625
--- /dev/null
+++ b/fs/jfs/jfs_imap.c
@@ -0,0 +1,3270 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *      jfs_imap.c: inode allocation map manager
+ *
+ * Serialization:
+ *   Each AG has a simple lock which is used to control the serialization of
+ *      the AG level lists.  This lock should be taken first whenever an AG
+ *      level list will be modified or accessed.
+ *
+ *   Each IAG is locked by obtaining the buffer for the IAG page.
+ *
+ *   There is also an inode lock for the inode map inode.  A read lock needs to
+ *      be taken whenever an IAG is read from the map or the global level
+ *      information is read.  A write lock needs to be taken whenever the global
+ *      level information is modified or an atomic operation needs to be used.
+ *
+ *      If more than one IAG is read at one time, the read lock may not
+ *      be given up until all of the IAG's are read.  Otherwise, a deadlock
+ *      may occur when trying to obtain the read lock while another thread
+ *      holding the read lock is waiting on the IAG already being held.
+ *
+ *   The control page of the inode map is read into memory by diMount().
+ *      Thereafter it should only be modified in memory and then it will be
+ *      written out when the filesystem is unmounted by diUnmount().
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+/*
+ * imap locks
+ *
+ * thin wrappers around init_MUTEX()/down()/up() applied to the lock
+ * fields of the inode map control structure.  the imap argument is
+ * parenthesized so that any pointer expression may be passed.
+ */
+/* iag free list lock */
+#define IAGFREE_LOCK_INIT(imap)         init_MUTEX(&(imap)->im_freelock)
+#define IAGFREE_LOCK(imap)              down(&(imap)->im_freelock)
+#define IAGFREE_UNLOCK(imap)            up(&(imap)->im_freelock)
+/* per ag iag list locks */
+#define AG_LOCK_INIT(imap,index)        init_MUTEX(&((imap)->im_aglock[index]))
+#define AG_LOCK(imap,agno)              down(&(imap)->im_aglock[agno])
+#define AG_UNLOCK(imap,agno)            up(&(imap)->im_aglock[agno])
+/*
+ * external references
+ *
+ * jfs_aops is defined outside this file; diReadSpecial() installs it as
+ * the a_ops of each special inode's mapping.
+ */
+extern struct address_space_operations jfs_aops;
+/*
+ * forward references
+ *
+ * file-local helpers that are used before their definitions appear.
+ */
+static int diAllocAG(struct inomap *, int, boolean_t, struct inode *);
+static int diAllocAny(struct inomap *, int, boolean_t, struct inode *);
+static int diAllocBit(struct inomap *, struct iag *, int);
+static int diAllocExt(struct inomap *, int, struct inode *);
+static int diAllocIno(struct inomap *, int, struct inode *);
+static int diFindFree(u32, int);
+static int diNewExt(struct inomap *, struct iag *, int);
+static int diNewIAG(struct inomap *, int *, int, struct metapage **);
+static void duplicateIXtree(struct super_block *, s64, int, s64 *);
+static int diIAGRead(struct inomap * imap, int, struct metapage **);
+static int copy_from_dinode(struct dinode *, struct inode *);
+static void copy_to_dinode(struct dinode *, struct inode *);
+/*
+ *      debug code for double-checking inode map
+ *
+ * the DBG_* hooks call the DBGdi*() routines only when _JFS_DEBUG_IMAP
+ * is defined; otherwise they expand to nothing.
+ */
+/* #define      _JFS_DEBUG_IMAP 1
+ * (left commented out here, so unless the symbol is defined elsewhere
+ * the empty definitions in the #else branch are the ones in effect)
+ */
+#ifdef  _JFS_DEBUG_IMAP
+#define DBG_DIINIT(imap)        DBGdiInit(imap)
+#define DBG_DIALLOC(imap, ino)  DBGdiAlloc(imap, ino)
+#define DBG_DIFREE(imap, ino)   DBGdiFree(imap, ino)
+static void *DBGdiInit(struct inomap * imap);
+static void DBGdiAlloc(struct inomap * imap, ino_t ino);
+static void DBGdiFree(struct inomap * imap, ino_t ino);
+#else
+#define DBG_DIINIT(imap)
+#define DBG_DIALLOC(imap, ino)
+#define DBG_DIFREE(imap, ino)
+#endif                          /* _JFS_DEBUG_IMAP */
+/*
+ * NAME:        diMount()
+ *
+ * FUNCTION:    build the in-memory inode map control structure for a
+ *              fileset or aggregate at mount time.
+ *
+ *              the control page of the map (struct dinomap_disk) is read
+ *              through the map inode, its fields are converted with
+ *              le32_to_cpu() and copied into a freshly allocated
+ *              struct inomap, which is then attached to the map inode.
+ *
+ * PARAMETERS:
+ *      ipimap  - pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -ENOMEM - the control structure could not be allocated.
+ *      -EIO    - the control page could not be read.
+ */
+int diMount(struct inode *ipimap)
+{
+        struct inomap *ctl;
+        struct metapage *ctlpage;
+        struct dinomap_disk *ondisk;
+        int agno;
+        /* get memory for the in-memory control structure */
+        ctl = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+        if (!ctl) {
+                jfs_err("diMount: kmalloc returned NULL!");
+                return -ENOMEM;
+        }
+        /* bring in the page holding the on-disk control structure */
+        ctlpage = read_metapage(ipimap,
+                                IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+                                PSIZE, 0);
+        if (!ctlpage) {
+                /* nothing has been attached yet, so just give it back */
+                kfree(ctl);
+                return -EIO;
+        }
+        ondisk = (struct dinomap_disk *) ctlpage->data;
+        /* map-wide fields first ... */
+        ctl->im_freeiag = le32_to_cpu(ondisk->in_freeiag);
+        ctl->im_nextiag = le32_to_cpu(ondisk->in_nextiag);
+        atomic_set(&ctl->im_numinos, le32_to_cpu(ondisk->in_numinos));
+        atomic_set(&ctl->im_numfree, le32_to_cpu(ondisk->in_numfree));
+        ctl->im_nbperiext = le32_to_cpu(ondisk->in_nbperiext);
+        ctl->im_l2nbperiext = le32_to_cpu(ondisk->in_l2nbperiext);
+        /* ... then the control entry of every AG */
+        agno = 0;
+        while (agno < MAXAG) {
+                ctl->im_agctl[agno].inofree =
+                    le32_to_cpu(ondisk->in_agctl[agno].inofree);
+                ctl->im_agctl[agno].extfree =
+                    le32_to_cpu(ondisk->in_agctl[agno].extfree);
+                ctl->im_agctl[agno].numinos =
+                    le32_to_cpu(ondisk->in_agctl[agno].numinos);
+                ctl->im_agctl[agno].numfree =
+                    le32_to_cpu(ondisk->in_agctl[agno].numfree);
+                agno++;
+        }
+        /* the copy is complete, so the page can be let go */
+        release_metapage(ctlpage);
+        /* set up the iag free list lock and one list lock per AG */
+        IAGFREE_LOCK_INIT(ctl);
+        for (agno = 0; agno < MAXAG; agno++)
+                AG_LOCK_INIT(ctl, agno);
+        /* cross-link the map inode and its control structure */
+        ctl->im_ipimap = ipimap;
+        JFS_IP(ipimap)->i_imap = ctl;
+        return 0;
+}
+/*
+ * NAME:        diUnmount()
+ *
+ * FUNCTION:    tear down the in-memory inode map control structure of a
+ *              fileset or aggregate at unmount time, writing it back to
+ *              disk first when that is appropriate.
+ *
+ * PARAMETERS:
+ *      ipimap     - pointer to inode map inode for the aggregate or fileset.
+ *      mounterror - when non-zero, diSync() is not called.
+ *
+ * RETURN VALUES:
+ *      0       - always; the result of diSync() is not examined.
+ */
+int diUnmount(struct inode *ipimap, int mounterror)
+{
+        struct inomap *ctl = JFS_IP(ipimap)->i_imap;
+        /* write the control structure back, but only when there was no
+         * mount error and the map inode is not read-only.
+         */
+        if (!mounterror && !isReadOnly(ipimap))
+                diSync(ipimap);
+        /* drop every cached page of the map inode */
+        truncate_inode_pages(ipimap->i_mapping, 0);
+        /* finally give back the control structure itself */
+        kfree(ctl);
+        return 0;
+}
+/*
+ * NAME:        diSync()
+ *
+ * FUNCTION:    copy the in-memory inode map control structure into the
+ *              map's control page and write that page, then flush the
+ *              map inode's dirty pages and write the map inode itself
+ *              through diWriteSpecial().
+ *
+ * PARAMETERS:
+ *      ipimap  - pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - the control page could not be obtained.
+ */
+int diSync(struct inode *ipimap)
+{
+        struct inomap *ctl = JFS_IP(ipimap)->i_imap;
+        struct dinomap_disk *ondisk;
+        struct metapage *ctlpage;
+        int agno;
+        /* obtain the metapage that backs the global control page */
+        ctlpage = get_metapage(ipimap,
+                               IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+                               PSIZE, 0);
+        if (!ctlpage) {
+                jfs_err("diSync: get_metapage failed!");
+                return -EIO;
+        }
+        ondisk = (struct dinomap_disk *) ctlpage->data;
+        /* map-wide fields first ... */
+        ondisk->in_freeiag = cpu_to_le32(ctl->im_freeiag);
+        ondisk->in_nextiag = cpu_to_le32(ctl->im_nextiag);
+        ondisk->in_numinos = cpu_to_le32(atomic_read(&ctl->im_numinos));
+        ondisk->in_numfree = cpu_to_le32(atomic_read(&ctl->im_numfree));
+        ondisk->in_nbperiext = cpu_to_le32(ctl->im_nbperiext);
+        ondisk->in_l2nbperiext = cpu_to_le32(ctl->im_l2nbperiext);
+        /* ... then the control entry of every AG */
+        agno = 0;
+        while (agno < MAXAG) {
+                ondisk->in_agctl[agno].inofree =
+                    cpu_to_le32(ctl->im_agctl[agno].inofree);
+                ondisk->in_agctl[agno].extfree =
+                    cpu_to_le32(ctl->im_agctl[agno].extfree);
+                ondisk->in_agctl[agno].numinos =
+                    cpu_to_le32(ctl->im_agctl[agno].numinos);
+                ondisk->in_agctl[agno].numfree =
+                    cpu_to_le32(ctl->im_agctl[agno].numfree);
+                agno++;
+        }
+        /* hand the filled-in control page to write_metapage() */
+        write_metapage(ctlpage);
+        /* push out the remaining dirty pages of the map and wait */
+        filemap_fdatawrite(ipimap->i_mapping);
+        filemap_fdatawait(ipimap->i_mapping);
+        /* last, the map inode itself (0: not the secondary table) */
+        diWriteSpecial(ipimap, 0);
+        return 0;
+}
+/*
+ * NAME:        diRead()
+ *
+ * FUNCTION:    initialize an incore inode from disk.
+ *
+ *              on entry, the specified incore inode should itself
+ *              specify the disk inode number corresponding to the
+ *              incore inode (i.e. ip->i_ino should be initialized).
+ *              
+ *              this routine handles incore inode initialization for
+ *              both "special" and "regular" inodes.  special inodes
+ *              are those required early in the mount process and
+ *              require special handling since much of the file system
+ *              is not yet initialized.  these "special" inodes are
+ *              identified by a NULL inode map inode pointer and are
+ *              actually initialized by a call to diReadSpecial().
+ *
+ *              NOTE(review): the body below contains no NULL-map test
+ *              and no call to diReadSpecial(); it always goes through
+ *              the fileset inode map (sbi->ipimap).  the paragraph above
+ *              may be out of date - confirm against the callers.
+ *              
+ *              for regular inodes, the iag describing the disk inode
+ *              is read from disk to determine the inode extent address
+ *              for the disk inode.  with the inode extent address in
+ *              hand, the page of the extent that contains the disk
+ *              inode is read and the disk inode is copied to the
+ *              incore inode.
+ *
+ * PARAMETERS:
+ *      ip  -  pointer to incore inode to be initialized from disk.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error, or the disk inode's di_number does not
+ *                match ip->i_ino.
+ *      -ENOMEM - insufficient memory
+ *      -ESTALE - the iag holds no usable extent for the inode (zero
+ *                address, or a length other than im_nbperiext), or the
+ *                disk inode has di_nlink == 0.
+ *      other   - a non-zero result of diIAGRead() or copy_from_dinode()
+ *                is returned unchanged.
+ *      
+ */
+int diRead(struct inode *ip)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        int iagno, ino, extno, rc;
+        struct inode *ipimap;
+        struct dinode *dp;
+        struct iag *iagp;
+        struct metapage *mp;
+        s64 blkno, agstart;
+        struct inomap *imap;
+        int block_offset;       /* blocks by which blkno is past a page boundary */
+        int inodes_left;        /* inodes between blkno and the end of its page */
+        uint pageno;            /* page that is actually read */
+        int rel_inode;          /* index of the disk inode within that page */
+        jfs_info("diRead: ino = %ld", ip->i_ino);
+        ipimap = sbi->ipimap;
+        JFS_IP(ip)->ipimap = ipimap;
+        /* determine the iag number for this inode (number) */
+        iagno = INOTOIAG(ip->i_ino);
+        /* read the iag; the map inode's read lock is held only for the
+         * duration of diIAGRead() itself.
+         */
+        imap = JFS_IP(ipimap)->i_imap;
+        IREAD_LOCK(ipimap);
+        rc = diIAGRead(imap, iagno, &mp);
+        IREAD_UNLOCK(ipimap);
+        if (rc) {
+                jfs_err("diRead: diIAGRead returned %d", rc);
+                return (rc);
+        }
+        iagp = (struct iag *) mp->data;
+        /* determine inode extent that holds the disk inode: ino becomes
+         * the inode's index within its iag, extno the index of its
+         * extent.  an extent slot whose length differs from
+         * im_nbperiext, or whose address is 0, is answered with -ESTALE.
+         */
+        ino = ip->i_ino & (INOSPERIAG - 1);
+        extno = ino >> L2INOSPEREXT;
+        if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
+            (addressPXD(&iagp->inoext[extno]) == 0)) {
+                release_metapage(mp);
+                return -ESTALE;
+        }
+        /* get disk block number of the page within the inode extent
+         * that holds the disk inode.
+         */
+        blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
+        /* get the ag for the iag; this is the last use of the iag page,
+         * which is released right after.
+         */
+        agstart = le64_to_cpu(iagp->agstart);
+        release_metapage(mp);
+        rel_inode = (ino & (INOSPERPAGE - 1));
+        pageno = blkno >> sbi->l2nbperpage;
+        if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+                /*
+                 * OS/2 didn't always align inode extents on page boundaries
+                 *
+                 * blkno starts block_offset blocks into page pageno, so
+                 * the group of inodes selected above straddles two
+                 * pages: inodes_left of them lie in the remainder of
+                 * this page, the others at the start of the next one.
+                 * rel_inode is rebased to whichever page holds the
+                 * inode.  (assumes l2niperblk is log2 of the number of
+                 * inodes per block - confirm)
+                 */
+                inodes_left =
+                     (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+                if (rel_inode < inodes_left)
+                        rel_inode += block_offset << sbi->l2niperblk;
+                else {
+                        pageno += 1;
+                        rel_inode -= inodes_left;
+                }
+        }
+        /* read the page of disk inode; mp is reused for this page */
+        mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+        if (mp == 0) {
+                jfs_err("diRead: read_metapage failed");
+                return -EIO;
+        }
+        /* locate the disk inode requested */
+        dp = (struct dinode *) mp->data;
+        dp += rel_inode;
+        if (ip->i_ino != le32_to_cpu(dp->di_number)) {
+                jfs_error(ip->i_sb, "diRead: i_ino != di_number");
+                rc = -EIO;
+        } else if (le32_to_cpu(dp->di_nlink) == 0)
+                rc = -ESTALE;
+        else
+                /* copy the disk inode to the in-memory inode */
+                rc = copy_from_dinode(dp, ip);
+        release_metapage(mp);
+        /* set the ag for the inode; this also runs when rc was set to
+         * an error just above.
+         */
+        JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
+        JFS_IP(ip)->active_ag = -1;
+        return (rc);
+}
+/*
+ * NAME:        diReadSpecial()
+ *
+ * FUNCTION:    initialize a 'special' inode from disk.
+ *
+ *              this routine handles aggregate level inodes.  The
+ *              inode cache cannot differentiate between the
+ *              aggregate inodes and the filesystem inodes, so we
+ *              handle these here.  We don't actually use the aggregate
+ *              inode map, since these inodes are at a fixed location
+ *              and in some cases the aggregate inode map isn't initialized
+ *              yet.
+ *
+ *              a new inode is obtained with new_inode(), the page of
+ *              the aggregate inode table that holds inode inum is read,
+ *              and the disk inode is copied in with copy_from_dinode().
+ *              when FILESYSTEM_I is read from the primary table, its
+ *              di_gengen and di_inostamp are also stored in the
+ *              superblock info (sbi->gengen, sbi->inostamp).
+ *
+ * PARAMETERS:
+ *      sb - filesystem superblock
+ *      inum - aggregate inode number; must be below INOSPEREXT (ASSERT)
+ *      secondary - 1 if secondary aggregate inode table
+ *
+ * RETURN VALUES:
+ *      new inode       - success
+ *      NULL            - new_inode() failed, the inode table page could
+ *                        not be read, or copy_from_dinode() reported an
+ *                        error.
+ */
+struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        uint address;           /* counted in pages until shifted for the read */
+        struct dinode *dp;
+        struct inode *ip;
+        struct metapage *mp;
+        ip = new_inode(sb);
+        if (ip == NULL) {
+                jfs_err("diReadSpecial: new_inode returned NULL!");
+                return ip;
+        }
+        if (secondary) {
+                address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+                JFS_IP(ip)->ipimap = sbi->ipaimap2;
+        } else {
+                address = AITBL_OFF >> L2PSIZE;
+                JFS_IP(ip)->ipimap = sbi->ipaimap;
+        }
+        ASSERT(inum < INOSPEREXT);
+        ip->i_ino = inum;
+        address += inum >> 3;   /* 8 inodes per 4K page */
+        /* read the page of fixed disk inode (AIT) in raw mode; the page
+         * count is shifted by l2nbperpage before being passed on.
+         */
+        mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+        if (mp == NULL) {
+                ip->i_nlink = 1;        /* Don't want iput() deleting it */
+                iput(ip);
+                return (NULL);
+        }
+        /* get the pointer to the disk inode of interest */
+        dp = (struct dinode *) (mp->data);
+        dp += inum % 8;         /* 8 inodes per 4K page */
+        /* copy on-disk inode to in-memory inode */
+        if ((copy_from_dinode(dp, ip)) != 0) {
+                /* handle bad return by returning NULL for ip */
+                ip->i_nlink = 1;        /* Don't want iput() deleting it */
+                iput(ip);
+                /* release the page */
+                release_metapage(mp);
+                return (NULL);
+        }
+        ip->i_mapping->a_ops = &jfs_aops;
+        mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS);
+        /* Allocations to metadata inodes should not affect quotas */
+        ip->i_flags |= S_NOQUOTA;
+        if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
+                sbi->gengen = le32_to_cpu(dp->di_gengen);
+                sbi->inostamp = le32_to_cpu(dp->di_inostamp);
+        }
+        /* release the page; dp points into it and is not used afterwards */
+        release_metapage(mp);
+        return (ip);
+}
+/*
+ * NAME:        diWriteSpecial()
+ *
+ * FUNCTION:    copy a special (aggregate level) in-memory inode into its
+ *              slot of the aggregate inode table and write that page.
+ *
+ * PARAMETERS:
+ *      ip - special inode
+ *      secondary - non-zero selects the secondary aggregate inode table
+ *
+ * RETURN VALUES: none.  if the table page cannot be read the failure is
+ *                logged and nothing is written.
+ */
+void diWriteSpecial(struct inode *ip, int secondary)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        ino_t inum = ip->i_ino;
+        struct metapage *aitpage;
+        struct dinode *slot;
+        uint pageno;
+        /* the dirty state is cleared up front, before any I/O */
+        ip->i_state &= ~I_DIRTY;
+        /* page at which the selected inode table starts */
+        if (!secondary)
+                pageno = AITBL_OFF >> L2PSIZE;
+        else
+                pageno = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+        ASSERT(inum < INOSPEREXT);
+        /* step to the page that holds this inode: 8 inodes per 4K page */
+        pageno += inum >> 3;
+        /* read that page of the fixed disk inode table (AIT) in raw mode */
+        aitpage = read_metapage(ip, pageno << sbi->l2nbperpage, PSIZE, 1);
+        if (!aitpage) {
+                jfs_err("diWriteSpecial: failed to read aggregate inode "
+                        "extent!");
+                return;
+        }
+        /* the inode's slot within the page: again 8 inodes per 4K page */
+        slot = (struct dinode *) aitpage->data + inum % 8;
+        /* copy the in-memory inode to the on-disk inode, then 288 bytes
+         * starting at the in-memory i_xtroot over di_xtroot.
+         */
+        copy_to_dinode(slot, ip);
+        memcpy(&slot->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
+        /* FILESYSTEM_I additionally carries the generation from sbi */
+        if (inum == FILESYSTEM_I)
+                slot->di_gengen = cpu_to_le32(sbi->gengen);
+        /* write the page */
+        write_metapage(aitpage);
+}
+/*
+ * NAME:        diFreeSpecial()
+ *
+ * FUNCTION:    Free allocated space for special inode
+ */
+void diFreeSpecial(struct inode *ip)
+{
+        struct address_space *mapping;
+        /* nothing to tear down; report the bogus call and bail out */
+        if (!ip) {
+                jfs_err("diFreeSpecial called with NULL ip!");
+                return;
+        }
+        mapping = ip->i_mapping;
+        /* write out the inode's pages and wait for that to finish ... */
+        filemap_fdatawrite(mapping);
+        filemap_fdatawait(mapping);
+        /* ... then drop every page of the mapping, from offset 0 on */
+        truncate_inode_pages(mapping, 0);
+        /* finally give up our reference on the inode */
+        iput(ip);
+}
+/*
+ * NAME:        diWrite()
+ *
+ * FUNCTION:    write the on-disk inode portion of the in-memory inode
+ *              to its corresponding on-disk inode.
+ *
+ *              on entry, the specified incore inode should itself
+ *              specify the disk inode number corresponding to the
+ *              incore inode (i.e. i_number should be initialized).
+ *
+ *              the inode contains the inode extent address for the disk
+ *              inode.  with the inode extent address in hand, the
+ *              page of the extent that contains the disk inode is
+ *              read and the disk inode portion of the incore inode
+ *              is copied to the disk inode.
+ *              
+ * PARAMETERS:
+ *      tid -  transaction id
+ *      ip  -  pointer to incore inode to be written to the inode extent.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ */
+int diWrite(tid_t tid, struct inode *ip)
+{
+        /*
+         * Overview of the steps below:
+         *   1. validate the inode extent descriptor (ixpxd) of ip;
+         *   2. compute which page of the extent holds the disk inode and
+         *      the inode's index within that page;
+         *   3. read the page through the inode map inode (ipimap) and take
+         *      a transaction lock on it;
+         *   4. copy the btree root(s), inline symlink, inline EA and the
+         *      inode base from the in-memory inode to the disk inode,
+         *      recording each copied region in the tlock's linelock;
+         *   5. pass the page to write_metapage().
+         */
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        int rc = 0;             /* never reassigned: the final return is 0 */
+        s32 ino;
+        struct dinode *dp;
+        s64 blkno;
+        int block_offset;
+        int inodes_left;
+        struct metapage *mp;
+        uint pageno;
+        int rel_inode;
+        int dioffset;
+        struct inode *ipimap;
+        uint type;
+        lid_t lid;
+        struct tlock *ditlck, *tlck;
+        struct linelock *dilinelock, *ilinelock;
+        struct lv *lv;
+        int n;
+        ipimap = jfs_ip->ipimap;
+        /* inode index within its iag (low bits of the inode number) */
+        ino = ip->i_ino & (INOSPERIAG - 1);
+        /*
+         * the extent descriptor must have a non-zero address and exactly
+         * the per-extent block count recorded in the inode map.
+         */
+        if (!addressPXD(&(jfs_ip->ixpxd)) ||
+            (lengthPXD(&(jfs_ip->ixpxd)) !=
+             JFS_IP(ipimap)->i_imap->im_nbperiext)) {
+                jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
+                return -EIO;
+        }
+        /*
+         * read the page of disk inode containing the specified inode:
+         */
+        /* compute the block address of the page */
+        blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
+        /* index of the inode within its page, before any alignment fix-up */
+        rel_inode = (ino & (INOSPERPAGE - 1));
+        pageno = blkno >> sbi->l2nbperpage;
+        /* block_offset is non-zero when blkno is not page aligned */
+        if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+                /*
+                 * OS/2 didn't always align inode extents on page boundaries
+                 */
+                /* inodes that fit between blkno and the end of its page */
+                inodes_left =
+                    (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+                if (rel_inode < inodes_left)
+                        /* same page: skip the inodes preceding blkno */
+                        rel_inode += block_offset << sbi->l2niperblk;
+                else {
+                        /* the inode spills over into the following page */
+                        pageno += 1;
+                        rel_inode -= inodes_left;
+                }
+        }
+        /* read the page of disk inode */
+        /*
+         * retry: re-entered whenever txLock() below returns NULL.
+         * NOTE(review): there is no bound on the number of retries, and
+         * mp is not released here before re-reading -- presumably
+         * txLock() drops the page itself when it fails; confirm.
+         */
+      retry:
+        mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+        if (mp == 0)
+                return -EIO;
+        /* get the pointer to the disk inode */
+        dp = (struct dinode *) mp->data;
+        dp += rel_inode;
+        /*
+         * byte offset used for the linelock entries below.
+         * NOTE(review): this is derived from ino, not from the adjusted
+         * rel_inode, so for an unaligned extent it may not match the
+         * position dp actually points at -- confirm this is intended.
+         */
+        dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
+        /*
+         * acquire transaction lock on the on-disk inode;
+         * N.B. tlock is acquired on ipimap not ip;
+         */
+        if ((ditlck =
+             txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
+                goto retry;
+        dilinelock = (struct linelock *) & ditlck->lock;
+        /*
+         * copy btree root from in-memory inode to on-disk inode
+         *
+         * (tlock is taken from inline B+-tree root in in-memory
+         * inode when the B+-tree root is updated, which is pointed
+         * by jfs_ip->blid as well as being on tx tlock list)
+         *
+         * further processing of btree root is based on the copy
+         * in in-memory inode, where txLog() will log from, and,
+         * for xtree root, txUpdateMap() will update map and reset
+         * XAD_NEW bit;
+         */
+        if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
+                /*
+                 * This is the special xtree inside the directory for storing
+                 * the directory table
+                 */
+                xtpage_t *p, *xp;
+                xad_t *xad;
+                /* consume the pending lock id and attach its tlock to mp */
+                jfs_ip->xtlid = 0;
+                tlck = lid_to_tlock(lid);
+                assert(tlck->type & tlckXTREE);
+                tlck->type |= tlckBTROOT;
+                tlck->mp = mp;
+                ilinelock = (struct linelock *) & tlck->lock;
+                /*
+                 * copy xtree root from inode to dinode:
+                 */
+                p = &jfs_ip->i_xtroot;
+                xp = (xtpage_t *) &dp->di_dirtable;
+                /* copy only the xad ranges recorded in the linelock */
+                lv = ilinelock->lv;
+                for (n = 0; n < ilinelock->index; n++, lv++) {
+                        memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+                               lv->length << L2XTSLOTSIZE);
+                }
+                /* reset on-disk (metadata page) xtree XAD_NEW bit */
+                xad = &xp->xad[XTENTRYSTART];
+                for (n = XTENTRYSTART;
+                     n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+                        if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+                                xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+        }
+        /* no btree root lock pending: skip straight to the inline data */
+        if ((lid = jfs_ip->blid) == 0)
+                goto inlineData;
+        jfs_ip->blid = 0;
+        tlck = lid_to_tlock(lid);
+        /* remember the type before tlckBTROOT is or-ed in */
+        type = tlck->type;
+        tlck->type |= tlckBTROOT;
+        tlck->mp = mp;
+        ilinelock = (struct linelock *) & tlck->lock;
+        /*
+         *      regular file: 16 byte (XAD slot) granularity
+         */
+        if (type & tlckXTREE) {
+                xtpage_t *p, *xp;
+                xad_t *xad;
+                /*
+                 * copy xtree root from inode to dinode:
+                 */
+                p = &jfs_ip->i_xtroot;
+                xp = &dp->di_xtroot;
+                lv = ilinelock->lv;
+                for (n = 0; n < ilinelock->index; n++, lv++) {
+                        memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+                               lv->length << L2XTSLOTSIZE);
+                }
+                /* reset on-disk (metadata page) xtree XAD_NEW bit */
+                xad = &xp->xad[XTENTRYSTART];
+                for (n = XTENTRYSTART;
+                     n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+                        if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+                                xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+        }
+        /*
+         *      directory: 32 byte (directory entry slot) granularity
+         */
+        else if (type & tlckDTREE) {
+                dtpage_t *p, *xp;
+                /*
+                 * copy dtree root from inode to dinode:
+                 */
+                p = (dtpage_t *) &jfs_ip->i_dtroot;
+                xp = (dtpage_t *) & dp->di_dtroot;
+                lv = ilinelock->lv;
+                for (n = 0; n < ilinelock->index; n++, lv++) {
+                        memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
+                               lv->length << L2DTSLOTSIZE);
+                }
+        } else {
+                /*
+                 * tlock is neither an xtree nor a dtree lock: this is only
+                 * logged, processing continues with the inline data.
+                 */
+                jfs_err("diWrite: UFO tlock");
+        }
+      inlineData:
+        /*
+         * copy inline symlink from in-memory inode to on-disk inode
+         */
+        if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
+                /* next free linelock entry: 2 slots from byte 2 * 128 on */
+                lv = & dilinelock->lv[dilinelock->index];
+                lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
+                lv->length = 2;
+                memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
+                dilinelock->index++;
+        }
+        /*
+         * copy inline data from in-memory inode to on-disk inode:
+         * 128 byte slot granularity
+         */
+        if (test_cflag(COMMIT_Inlineea, ip)) {
+                /* one slot starting at byte 3 * 128 of the disk inode */
+                lv = & dilinelock->lv[dilinelock->index];
+                lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
+                lv->length = 1;
+                memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
+                dilinelock->index++;
+                clear_cflag(COMMIT_Inlineea, ip);
+        }
+        /*
+         *      lock/copy inode base: 128 byte slot granularity
+         */
+// baseDinode:
+        lv = & dilinelock->lv[dilinelock->index];
+        lv->offset = dioffset >> L2INODESLOTSIZE;
+        copy_to_dinode(dp, ip);
+        /*
+         * when the directory table is flagged dirty, copy its 96 bytes as
+         * well and cover two slots instead of one; the flag is cleared by
+         * the test.
+         */
+        if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
+                lv->length = 2;
+                memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
+        } else
+                lv->length = 1;
+        dilinelock->index++;
+#ifdef _JFS_FASTDASD
+        /*
+         * We aren't logging changes to the DASD used in directory inodes,
+         * but we need to write them to disk.  If we don't unmount cleanly,
+         * mount will recalculate the DASD used.
+         */
+        if (S_ISDIR(ip->i_mode)
+            && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
+                memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
+#endif                          /*  _JFS_FASTDASD */
+        /* release the buffer holding the updated on-disk inode.
+         * the buffer will be later written by commit processing.
+         */
+        write_metapage(mp);
+        return (rc);
+}
+/*
+ * NAME:        diFree(ip)
+ *
+ * FUNCTION:    free a specified inode from the inode working map
+ *              for a fileset or aggregate.
+ *
+ *              if the inode to be freed represents the first (only)
+ *              free inode within the iag, the iag will be placed on
+ *              the ag free inode list.
+ *      
+ *              freeing the inode will cause the inode extent to be
+ *              freed if the inode is the only allocated inode within
+ *              the extent.  in this case all the disk resource backing
+ *              up the inode extent will be freed. in addition, the iag
+ *              will be placed on the ag extent free list if the extent
+ *              is the first free extent in the iag.  if freeing the
+ *              extent also means that no free inodes will exist for
+ *              the iag, the iag will also be removed from the ag free
+ *              inode list.
+ *
+ *              the iag describing the inode will be freed if the extent
+ *              is to be freed and it is the only backed extent within
+ *              the iag.  in this case, the iag will be removed from the
+ *              ag free extent list and ag free inode list and placed on
+ *              the inode map's free iag list.
+ *
+ *              a careful update approach is used to provide consistency
+ *              in the face of updates to multiple buffers.  under this
+ *              approach, all required buffers are obtained before making
+ *              any updates and are held until all updates are complete.
+ *
+ * PARAMETERS:
+ *      ip      - inode to be freed.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - i/o error.
+ */
+int diFree(struct inode *ip)
+{
+        /*
+         * Two paths follow the initial checks:
+         *   - "keep" path: the inode's bit is cleared in the working map
+         *     and the free counts are bumped; the extent stays backed.
+         *   - "free extent" path: the whole inode extent is released.  All
+         *     neighbouring iags on the affected free lists are read first
+         *     (step 1), then every structure is updated (step 2), and a
+         *     transaction is committed to free the extent's blocks.
+         * mp/iagp is the iag of the inode; amp/bmp/cmp/dmp (aiagp..diagp)
+         * are the list neighbours read on demand.
+         */
+        int rc;
+        ino_t inum = ip->i_ino;
+        struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
+        struct metapage *mp, *amp, *bmp, *cmp, *dmp;
+        int iagno, ino, extno, bitno, sword, agno;
+        int back, fwd;
+        u32 bitmap, mask;
+        struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
+        struct inomap *imap = JFS_IP(ipimap)->i_imap;
+        pxd_t freepxd;
+        tid_t tid;
+        struct inode *iplist[3];
+        struct tlock *tlck;
+        struct pxd_lock *pxdlock;
+        /*
+         * This is just to suppress compiler warnings.  The same logic that
+         * references these variables is used to initialize them.
+         */
+        aiagp = biagp = ciagp = diagp = NULL;
+        /* get the iag number containing the inode.
+         */
+        iagno = INOTOIAG(inum);
+        /* make sure that the iag is contained within
+         * the map.
+         */
+        if (iagno >= imap->im_nextiag) {
+                dump_mem("imap", imap, 32);
+                jfs_error(ip->i_sb,
+                          "diFree: inum = %d, iagno = %d, nextiag = %d",
+                          (uint) inum, iagno, imap->im_nextiag);
+                return -EIO;
+        }
+        /* get the allocation group for this ino.
+         */
+        agno = JFS_IP(ip)->agno;
+        /* Lock the AG specific inode map information
+         */
+        AG_LOCK(imap, agno);
+        /* Obtain read lock in imap inode.  Don't release it until we have
+         * read all of the IAG's that we are going to.
+         */
+        IREAD_LOCK(ipimap);
+        /* read the iag.
+         */
+        if ((rc = diIAGRead(imap, iagno, &mp))) {
+                IREAD_UNLOCK(ipimap);
+                AG_UNLOCK(imap, agno);
+                return (rc);
+        }
+        iagp = (struct iag *) mp->data;
+        /* get the inode number and extent number of the inode within
+         * the iag and the inode number within the extent.
+         */
+        ino = inum & (INOSPERIAG - 1);
+        extno = ino >> L2INOSPEREXT;
+        bitno = ino & (INOSPEREXT - 1);
+        /* bit of this inode in the extent's 32-bit map word */
+        mask = HIGHORDER >> bitno;
+        /*
+         * NOTE(review): this inconsistency is only reported through
+         * jfs_error(); there is no return, so processing continues.
+         */
+        if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+                jfs_error(ip->i_sb,
+                          "diFree: wmap shows inode already free");
+        }
+        /* the extent must be backed (non-zero address) to hold an inode */
+        if (!addressPXD(&iagp->inoext[extno])) {
+                release_metapage(mp);
+                IREAD_UNLOCK(ipimap);
+                AG_UNLOCK(imap, agno);
+                jfs_error(ip->i_sb, "diFree: invalid inoext");
+                return -EIO;
+        }
+        /* compute the bitmap for the extent reflecting the freed inode.
+         */
+        bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
+        /* sanity check of the AG counters before they are relied upon */
+        if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
+                release_metapage(mp);
+                IREAD_UNLOCK(ipimap);
+                AG_UNLOCK(imap, agno);
+                jfs_error(ip->i_sb, "diFree: numfree > numinos");
+                return -EIO;
+        }
+        /*
+         *      inode extent still has some inodes or below low water mark:
+         *      keep the inode extent;
+         *
+         *      i.e. keep it when other inodes of the extent are still
+         *      allocated (bitmap != 0), when the ag has fewer than 96 free
+         *      inodes, or when it has fewer than 288 free inodes and these
+         *      are at most 25% of the ag's backed inodes.
+         */
+        if (bitmap ||
+            imap->im_agctl[agno].numfree < 96 ||
+            (imap->im_agctl[agno].numfree < 288 &&
+             (((imap->im_agctl[agno].numfree * 100) /
+               imap->im_agctl[agno].numinos) <= 25))) {
+                /* if the iag currently has no free inodes (i.e.,
+                 * the inode being freed is the first free inode of iag),
+                 * insert the iag at head of the inode free list for the ag.
+                 */
+                if (iagp->nfreeinos == 0) {
+                        /* check if there are any iags on the ag inode
+                         * free list.  if so, read the first one so that
+                         * we can link the current iag onto the list at
+                         * the head.
+                         */
+                        if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
+                                /* read the iag that currently is the head
+                                 * of the list.
+                                 */
+                                if ((rc = diIAGRead(imap, fwd, &amp))) {
+                                        IREAD_UNLOCK(ipimap);
+                                        AG_UNLOCK(imap, agno);
+                                        release_metapage(mp);
+                                        return (rc);
+                                }
+                                aiagp = (struct iag *) amp->data;
+                                /* make current head point back to the iag.
+                                 */
+                                aiagp->inofreeback = cpu_to_le32(iagno);
+                                write_metapage(amp);
+                        }
+                        /* iag points forward to current head and iag
+                         * becomes the new head of the list.
+                         */
+                        iagp->inofreefwd =
+                            cpu_to_le32(imap->im_agctl[agno].inofree);
+                        iagp->inofreeback = cpu_to_le32(-1);
+                        imap->im_agctl[agno].inofree = iagno;
+                }
+                /* no further iag is read on this path */
+                IREAD_UNLOCK(ipimap);
+                /* update the free inode summary map for the extent if
+                 * freeing the inode means the extent will now have free
+                 * inodes (i.e., the inode being freed is the first free
+                 * inode of extent),
+                 */
+                if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+                        sword = extno >> L2EXTSPERSUM;
+                        bitno = extno & (EXTSPERSUM - 1);
+                        iagp->inosmap[sword] &=
+                            cpu_to_le32(~(HIGHORDER >> bitno));
+                }
+                /* update the bitmap.
+                 */
+                iagp->wmap[extno] = cpu_to_le32(bitmap);
+                DBG_DIFREE(imap, inum);
+                /* update the free inode counts at the iag, ag and
+                 * map level.
+                 */
+                iagp->nfreeinos =
+                    cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
+                imap->im_agctl[agno].numfree += 1;
+                atomic_inc(&imap->im_numfree);
+                /* release the AG inode map lock
+                 */
+                AG_UNLOCK(imap, agno);
+                /* write the iag */
+                write_metapage(mp);
+                return (0);
+        }
+        /*
+         *      inode extent has become free and above low water mark:
+         *      free the inode extent;
+         */
+        /*
+         *      prepare to update iag list(s) (careful update step 1)
+         */
+        /*
+         * start from "nothing read": error_out and the final write-out
+         * only touch the pages whose pointer is non-NULL.
+         */
+        amp = bmp = cmp = dmp = NULL;
+        fwd = back = -1;
+        /* check if the iag currently has no free extents.  if so,
+         * it will be placed on the head of the ag extent free list.
+         */
+        if (iagp->nfreeexts == 0) {
+                /* check if the ag extent free list has any iags.
+                 * if so, read the iag at the head of the list now.
+                 * this (head) iag will be updated later to reflect
+                 * the addition of the current iag at the head of
+                 * the list.
+                 */
+                if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+                        if ((rc = diIAGRead(imap, fwd, &amp)))
+                                goto error_out;
+                        aiagp = (struct iag *) amp->data;
+                }
+        } else {
+                /* iag has free extents. check if the addition of a free
+                 * extent will cause all extents to be free within this
+                 * iag.  if so, the iag will be removed from the ag extent
+                 * free list and placed on the inode map's free iag list.
+                 */
+                if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+                        /* in preparation for removing the iag from the
+                         * ag extent free list, read the iags preceding
+                         * and following the iag on the ag extent free
+                         * list.
+                         */
+                        if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+                                if ((rc = diIAGRead(imap, fwd, &amp)))
+                                        goto error_out;
+                                aiagp = (struct iag *) amp->data;
+                        }
+                        if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+                                if ((rc = diIAGRead(imap, back, &bmp)))
+                                        goto error_out;
+                                biagp = (struct iag *) bmp->data;
+                        }
+                }
+        }
+        /* remove the iag from the ag inode free list if freeing
+         * this extent causes the iag to have no free inodes.
+         * (nfreeinos == INOSPEREXT - 1 means the extent's other inodes
+         * are the only free inodes the iag currently has.)
+         */
+        if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+                int inofreeback = le32_to_cpu(iagp->inofreeback);
+                int inofreefwd = le32_to_cpu(iagp->inofreefwd);
+                /* in preparation for removing the iag from the
+                 * ag inode free list, read the iags preceding
+                 * and following the iag on the ag inode free
+                 * list.  before reading these iags, we must make
+                 * sure that we already don't have them in hand
+                 * from up above, since re-reading an iag (buffer)
+                 * we are currently holding would cause a deadlock.
+                 */
+                if (inofreefwd >= 0) {
+                        /* reuse amp/bmp when they already hold this iag */
+                        if (inofreefwd == fwd)
+                                ciagp = (struct iag *) amp->data;
+                        else if (inofreefwd == back)
+                                ciagp = (struct iag *) bmp->data;
+                        else {
+                                if ((rc =
+                                     diIAGRead(imap, inofreefwd, &cmp)))
+                                        goto error_out;
+                                ciagp = (struct iag *) cmp->data;
+                        }
+                        assert(ciagp != NULL);
+                }
+                if (inofreeback >= 0) {
+                        /* reuse amp/bmp when they already hold this iag */
+                        if (inofreeback == fwd)
+                                diagp = (struct iag *) amp->data;
+                        else if (inofreeback == back)
+                                diagp = (struct iag *) bmp->data;
+                        else {
+                                if ((rc =
+                                     diIAGRead(imap, inofreeback, &dmp)))
+                                        goto error_out;
+                                diagp = (struct iag *) dmp->data;
+                        }
+                        assert(diagp != NULL);
+                }
+        }
+        /*
+         * every needed iag is in hand; error_out is not jumped to after
+         * this point.
+         */
+        IREAD_UNLOCK(ipimap);
+        /*
+         * invalidate any page of the inode extent freed from buffer cache;
+         */
+        /* keep a copy of the descriptor: inoext[extno] is zeroed below */
+        freepxd = iagp->inoext[extno];
+        invalidate_pxd_metapages(ip, freepxd);
+        /*
+         *      update iag list(s) (careful update step 2)
+         */
+        /* add the iag to the ag extent free list if this is the
+         * first free extent for the iag.
+         */
+        if (iagp->nfreeexts == 0) {
+                if (fwd >= 0)
+                        aiagp->extfreeback = cpu_to_le32(iagno);
+                iagp->extfreefwd =
+                    cpu_to_le32(imap->im_agctl[agno].extfree);
+                iagp->extfreeback = cpu_to_le32(-1);
+                imap->im_agctl[agno].extfree = iagno;
+        } else {
+                /* remove the iag from the ag extent list if all extents
+                 * are now free and place it on the inode map iag free list.
+                 */
+                if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+                        /* unlink: fix up both neighbours or the list head */
+                        if (fwd >= 0)
+                                aiagp->extfreeback = iagp->extfreeback;
+                        if (back >= 0)
+                                biagp->extfreefwd = iagp->extfreefwd;
+                        else
+                                imap->im_agctl[agno].extfree =
+                                    le32_to_cpu(iagp->extfreefwd);
+                        iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
+                        /* push the iag onto the map's free iag list */
+                        IAGFREE_LOCK(imap);
+                        iagp->iagfree = cpu_to_le32(imap->im_freeiag);
+                        imap->im_freeiag = iagno;
+                        IAGFREE_UNLOCK(imap);
+                }
+        }
+        /* remove the iag from the ag inode free list if freeing
+         * this extent causes the iag to have no free inodes.
+         */
+        if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+                if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
+                        ciagp->inofreeback = iagp->inofreeback;
+                if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
+                        diagp->inofreefwd = iagp->inofreefwd;
+                else
+                        imap->im_agctl[agno].inofree =
+                            le32_to_cpu(iagp->inofreefwd);
+                iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
+        }
+        /* update the inode extent address and working map
+         * to reflect the free extent.
+         * the permanent map should have been updated already
+         * for the inode being freed.
+         */
+        /* reported only; the extent is freed regardless */
+        if (iagp->pmap[extno] != 0) {
+                jfs_error(ip->i_sb, "diFree: the pmap does not show inode free");
+        }
+        iagp->wmap[extno] = 0;
+        DBG_DIFREE(imap, inum);
+        PXDlength(&iagp->inoext[extno], 0);
+        PXDaddress(&iagp->inoext[extno], 0);
+        /* update the free extent and free inode summary maps
+         * to reflect the freed extent.
+         * the inode summary map is marked to indicate no inodes
+         * available for the freed extent.
+         */
+        sword = extno >> L2EXTSPERSUM;
+        bitno = extno & (EXTSPERSUM - 1);
+        mask = HIGHORDER >> bitno;
+        iagp->inosmap[sword] |= cpu_to_le32(mask);
+        iagp->extsmap[sword] &= cpu_to_le32(~mask);
+        /* update the number of free inodes and number of free extents
+         * for the iag.
+         */
+        iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
+                                      (INOSPEREXT - 1));
+        iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
+        /* update the number of free inodes and backed inodes
+         * at the ag and inode map level.
+         */
+        imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
+        imap->im_agctl[agno].numinos -= INOSPEREXT;
+        atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
+        atomic_sub(INOSPEREXT, &imap->im_numinos);
+        /* write out whichever neighbouring iags were read in step 1 */
+        if (amp)
+                write_metapage(amp);
+        if (bmp)
+                write_metapage(bmp);
+        if (cmp)
+                write_metapage(cmp);
+        if (dmp)
+                write_metapage(dmp);
+        /*
+         * start transaction to update block allocation map
+         * for the inode extent freed;
+         *
+         * N.B. AG_LOCK is released and iag will be released below, and
+         * other thread may allocate inode from/reusing the ixad freed
+         * BUT with new/different backing inode extent from the extent
+         * to be freed by the transaction;
+         */
+        tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
+        down(&JFS_IP(ipimap)->commit_sem);
+        /* acquire tlock of the iag page of the freed ixad
+         * to force the page NOHOMEOK (even though no data is
+         * logged from the iag page) until NOREDOPAGE|FREEXTENT log
+         * for the free of the extent is committed;
+         * write FREEXTENT|NOREDOPAGE log record
+         * N.B. linelock is overlaid as freed extent descriptor;
+         */
+        /*
+         * NOTE(review): the txLock() result is dereferenced without a
+         * NULL check, whereas diWrite() retries when txLock() returns
+         * NULL -- confirm that NULL cannot be returned here.
+         */
+        tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
+        pxdlock = (struct pxd_lock *) & tlck->lock;
+        pxdlock->flag = mlckFREEPXD;
+        pxdlock->pxd = freepxd;
+        pxdlock->index = 1;
+        write_metapage(mp);
+        iplist[0] = ipimap;
+        /*
+         * logredo needs the IAG number and IAG extent index in order
+         * to ensure that the IMap is consistent.  The least disruptive
+         * way to pass these values through  to the transaction manager
+         * is in the iplist array.
+         *
+         * It's not pretty, but it works.
+         */
+        iplist[1] = (struct inode *) (size_t)iagno;
+        iplist[2] = (struct inode *) (size_t)extno;
+        /*
+         * NOTE(review): rc receives the txCommit() result, but the
+         * function returns 0 below regardless, so a commit failure is not
+         * reported to the caller.
+         */
+        rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+        txEnd(tid);
+        up(&JFS_IP(ipimap)->commit_sem);
+        /* unlock the AG inode map information */
+        AG_UNLOCK(imap, agno);
+        return (0);
+        /*
+         * error_out: reached only from step 1, i.e. while the imap read
+         * lock and the AG lock are still held; releases both along with
+         * every iag page read so far and returns the diIAGRead() error.
+         */
+      error_out:
+        IREAD_UNLOCK(ipimap);
+        if (amp)
+                release_metapage(amp);
+        if (bmp)
+                release_metapage(bmp);
+        if (cmp)
+                release_metapage(cmp);
+        if (dmp)
+                release_metapage(dmp);
+        AG_UNLOCK(imap, agno);
+        release_metapage(mp);
+        return (rc);
+}
+/*
+ * NAME:        diInitInode()
+ *
+ * FUNCTION:    There are several places in the diAlloc* routines where we
+ *              initialize the inode; this helper holds the common part.
+ *              It fills in the inode number, the inode extent descriptor,
+ *              the allocation group and resets active_ag.
+ *
+ * PARAMETERS:
+ *      ip      - in-memory inode being initialized (ip->i_sb must be set)
+ *      iagno   - number of the iag the inode was allocated from
+ *      ino     - inode index within that iag
+ *      extno   - index of the inode extent within the iag
+ *      iagp    - pointer to the iag
+ *
+ * RETURN VALUES: none
+ */
+static inline void
+diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        /* global inode number: iag number in the high bits, index below */
+        ip->i_ino = (iagno << L2INOSPERIAG) + ino;
+        /*
+         * the inode map inode is taken from the superblock info, as in
+         * diFree() and diAlloc(); this function has no local or parameter
+         * named ipimap, so the previous JFS_IP(ipimap) argument could only
+         * compile while the debug macro discarded its arguments.
+         */
+        DBG_DIALLOC(JFS_IP(sbi->ipimap)->i_imap, ip->i_ino);
+        /* remember which inode extent backs this inode */
+        jfs_ip->ixpxd = iagp->inoext[extno];
+        /* allocation group derived from the iag's starting block */
+        jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+        jfs_ip->active_ag = -1;
+}
+/*
+ * NAME:        diAlloc(pip,dir,ip)
+ *
+ * FUNCTION:    allocate a disk inode from the inode working map 
+ *              for a fileset or aggregate.
+ *
+ * PARAMETERS:
+ *      pip     - pointer to incore inode for the parent inode.
+ *      dir     - TRUE if the new disk inode is for a directory.
+ *      ip      - pointer to a new inode
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
+{
+        int rc, ino, iagno, addext, extno, bitno, sword;
+        int nwords, rem, i, agno;
+        u32 mask, inosmap, extsmap;
+        struct inode *ipimap;
+        struct metapage *mp;
+        ino_t inum;
+        struct iag *iagp;
+        struct inomap *imap;
+        /* get the pointers to the inode map inode and the
+         * corresponding imap control structure, and record the
+         * inode map and fileset in the new in-core inode.
+         */
+        ipimap = JFS_SBI(pip->i_sb)->ipimap;
+        imap = JFS_IP(ipimap)->i_imap;
+        JFS_IP(ip)->ipimap = ipimap;
+        JFS_IP(ip)->fileset = FILESYSTEM_I;
+        /* for a directory, the allocation policy is to start
+         * at the ag level using the preferred ag.
+         */
+        if (dir == TRUE) {
+                agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+                AG_LOCK(imap, agno);
+                goto tryag;
+        }
+        /* for files, the policy starts off by trying to allocate from
+         * the same iag containing the parent disk inode:
+         * try to allocate the new disk inode close to the parent disk
+         * inode, using parent disk inode number + 1 as the allocation
+         * hint.  (we use a left-to-right policy to attempt to avoid
+         * moving backward on the disk.)  compute the hint within the
+         * file system and the iag.
+         */
+        /* get the ag number of this iag */
+        agno = JFS_IP(pip)->agno;
+        if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
+                /*
+                 * There is an open file actively growing.  We want to
+                 * allocate new inodes from a different ag to avoid
+                 * fragmentation problems.
+                 */
+                agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+                AG_LOCK(imap, agno);
+                goto tryag;
+        }
+        inum = pip->i_ino + 1;
+        ino = inum & (INOSPERIAG - 1);
+        /* back off the hint if it is outside of the iag: when the
+         * parent is the last inode of its iag, parent + 1 wraps to
+         * inode 0 of the next iag, so use the parent's own number to
+         * stay within the parent's iag.
+         */
+        if (ino == 0)
+                inum = pip->i_ino;
+        /* lock the AG inode map information */
+        AG_LOCK(imap, agno);
+        /* Get read lock on imap inode */
+        IREAD_LOCK(ipimap);
+        /* get the iag number and read the iag */
+        iagno = INOTOIAG(inum);
+        if ((rc = diIAGRead(imap, iagno, &mp))) {
+                IREAD_UNLOCK(ipimap);
+                AG_UNLOCK(imap, agno);
+                return (rc);
+        }
+        iagp = (struct iag *) mp->data;
+        /* determine if new inode extent is allowed to be added to the iag.
+         * new inode extent can be added to the iag if the ag
+         * has less than 32 free disk inodes and the iag has free extents.
+         */
+        addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
+        /*
+         *      try to allocate from the IAG
+         */
+        /* check if the inode may be allocated from the iag
+         * (i.e. the inode has free inodes or new extent can be added).
+         */
+        if (iagp->nfreeinos || addext) {
+                /* determine the extent number of the hint.
+                 */
+                extno = ino >> L2INOSPEREXT;
+                /* check if the extent containing the hint has backed
+                 * inodes.  if so, try to allocate within this extent.
+                 */
+                if (addressPXD(&iagp->inoext[extno])) {
+                        bitno = ino & (INOSPEREXT - 1);
+                        if ((bitno =
+                             diFindFree(le32_to_cpu(iagp->wmap[extno]),
+                                        bitno))
+                            < INOSPEREXT) {
+                                ino = (extno << L2INOSPEREXT) + bitno;
+                                /* a free inode (bit) was found within this
+                                 * extent, so allocate it.
+                                 */
+                                rc = diAllocBit(imap, iagp, ino);
+                                IREAD_UNLOCK(ipimap);
+                                if (rc) {
+                                        assert(rc == -EIO);
+                                } else {
+                                        /* set the results of the allocation
+                                         * and write the iag.
+                                         */
+                                        diInitInode(ip, iagno, ino, extno,
+                                                    iagp);
+                                        mark_metapage_dirty(mp);
+                                }
+                                release_metapage(mp);
+                                /* free the AG lock and return.
+                                 */
+                                AG_UNLOCK(imap, agno);
+                                return (rc);
+                        }
+                        /* the hint's extent is full; unless a new extent
+                         * may be added, start the summary map scan at
+                         * the following extent (wrapping at the end of
+                         * the iag).
+                         */
+                        if (!addext)
+                                extno =
+                                    (extno ==
+                                     EXTSPERIAG - 1) ? 0 : extno + 1;
+                }
+                /*
+                 * no free inodes within the extent containing the hint.
+                 *
+                 * try to allocate from the backed extents following
+                 * hint or, if appropriate (i.e. addext is true), allocate
+                 * an extent of free inodes at or following the extent
+                 * containing the hint.
+                 *
+                 * the free inode and free extent summary maps are used
+                 * here, so determine the starting summary map position
+                 * and the number of words we'll have to examine.  again,
+                 * the approach is to allocate following the hint, so we
+                 * might have to initially ignore prior bits of the summary
+                 * map that represent extents prior to the extent containing
+                 * the hint and later revisit these bits.
+                 */
+                bitno = extno & (EXTSPERSUM - 1);
+                nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
+                sword = extno >> L2EXTSPERSUM;
+                /* mask any prior bits for the starting words of the
+                 * summary map.  when the hint is the first extent of the
+                 * word (bitno == 0) there are no prior bits to hide, and
+                 * the shift count would be EXTSPERSUM (expected to be the
+                 * full width of the u32 summary word), which is an
+                 * undefined shift in C; use an empty mask in that case.
+                 */
+                mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno));
+                inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
+                extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
+                /* scan the free inode and free extent summary maps for
+                 * free resources.
+                 */
+                for (i = 0; i < nwords; i++) {
+                        /* check if this word of the free inode summary
+                         * map describes an extent with free inodes.
+                         */
+                        if (~inosmap) {
+                                /* an extent with free inodes has been
+                                 * found. determine the extent number
+                                 * and the inode number within the extent.
+                                 */
+                                rem = diFindFree(inosmap, 0);
+                                extno = (sword << L2EXTSPERSUM) + rem;
+                                rem = diFindFree(le32_to_cpu(iagp->wmap[extno]),
+                                                 0);
+                                if (rem >= INOSPEREXT) {
+                                        /* summary map and working map
+                                         * disagree: report corruption.
+                                         */
+                                        IREAD_UNLOCK(ipimap);
+                                        release_metapage(mp);
+                                        AG_UNLOCK(imap, agno);
+                                        jfs_error(ip->i_sb,
+                                                  "diAlloc: can't find free bit "
+                                                  "in wmap");
+                                        /* errors are returned as negative
+                                         * errno values.
+                                         */
+                                        return -EIO;
+                                }
+                                /* determine the inode number within the
+                                 * iag and allocate the inode from the
+                                 * map.
+                                 */
+                                ino = (extno << L2INOSPEREXT) + rem;
+                                rc = diAllocBit(imap, iagp, ino);
+                                IREAD_UNLOCK(ipimap);
+                                if (rc)
+                                        assert(rc == -EIO);
+                                else {
+                                        /* set the results of the allocation
+                                         * and write the iag.
+                                         */
+                                        diInitInode(ip, iagno, ino, extno,
+                                                    iagp);
+                                        mark_metapage_dirty(mp);
+                                }
+                                release_metapage(mp);
+                                /* free the AG lock and return.
+                                 */
+                                AG_UNLOCK(imap, agno);
+                                return (rc);
+                        }
+                        /* check if we may allocate an extent of free
+                         * inodes and whether this word of the free
+                         * extents summary map describes a free extent.
+                         */
+                        if (addext && ~extsmap) {
+                                /* a free extent has been found.  determine
+                                 * the extent number.
+                                 */
+                                rem = diFindFree(extsmap, 0);
+                                extno = (sword << L2EXTSPERSUM) + rem;
+                                /* allocate an extent of free inodes.
+                                 */
+                                if ((rc = diNewExt(imap, iagp, extno))) {
+                                        /* if there is no disk space for a
+                                         * new extent, try to allocate the
+                                         * disk inode from somewhere else.
+                                         */
+                                        if (rc == -ENOSPC)
+                                                break;
+                                        assert(rc == -EIO);
+                                } else {
+                                        /* set the results of the allocation
+                                         * and write the iag.
+                                         */
+                                        diInitInode(ip, iagno,
+                                                    extno << L2INOSPEREXT,
+                                                    extno, iagp);
+                                        mark_metapage_dirty(mp);
+                                }
+                                release_metapage(mp);
+                                /* free the imap inode & the AG lock & return.
+                                 */
+                                IREAD_UNLOCK(ipimap);
+                                AG_UNLOCK(imap, agno);
+                                return (rc);
+                        }
+                        /* move on to the next set of summary map words.
+                         */
+                        sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
+                        inosmap = le32_to_cpu(iagp->inosmap[sword]);
+                        extsmap = le32_to_cpu(iagp->extsmap[sword]);
+                }
+        }
+        /* unlock imap inode */
+        IREAD_UNLOCK(ipimap);
+        /* nothing doing in this iag, so release it. */
+        release_metapage(mp);
+      tryag:
+        /*
+         * try to allocate anywhere within the chosen AG (the parent's
+         * AG, or the one picked by dbNextAG() above).  the AG lock is
+         * held on entry to this label.
+         */
+        rc = diAllocAG(imap, agno, dir, ip);
+        AG_UNLOCK(imap, agno);
+        if (rc != -ENOSPC)
+                return (rc);
+        /*
+         * try to allocate in any AG.
+         */
+        return (diAllocAny(imap, agno, dir, ip));
+}
+/*
+ * NAME:        diAllocAG(imap,agno,dir,ip)
+ *
+ * FUNCTION:    allocate a disk inode from the allocation group.
+ *
+ *              this routine first determines if a new extent of free
+ *              inodes should be added for the allocation group, with
+ *              the current request satisfied from this extent. if this
+ *              is the case, an attempt will be made to do just that.  if
+ *              this attempt fails or it has been determined that a new 
+ *              extent should not be added, an attempt is made to satisfy
+ *              the request by allocating an existing (backed) free inode
+ *              from the allocation group.
+ *
+ * PRE CONDITION: Already have the AG lock for this AG.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      agno    - allocation group to allocate from.
+ *      dir     - TRUE if the new disk inode is for a directory.
+ *      ip      - pointer to the new inode to be filled in on successful return
+ *                with the disk inode number allocated, its extent address
+ *                and the start of the ag.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+static int
+diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
+{
+        /* free and backed disk inode counts currently recorded
+         * for this ag.
+         */
+        int nfree = imap->im_agctl[agno].numfree;
+        int nbacked = imap->im_agctl[agno].numinos;
+        int want_ext = 0;
+        int err;
+        /* there can never be more free inodes than backed inodes.
+         */
+        if (nfree > nbacked) {
+                jfs_error(ip->i_sb, "diAllocAG: numfree > numinos");
+                return -EIO;
+        }
+        /* decide whether a new extent of free inodes should be added:
+         * directories ask for one when few inodes are free (fewer than
+         * 64, or fewer than 256 and at most 20% of the backed inodes);
+         * other inodes only when nothing at all is free.
+         */
+        if (dir == TRUE) {
+                if (nfree < 64)
+                        want_ext = 1;
+                else if (nfree < 256 && ((nfree * 100) / nbacked) <= 20)
+                        want_ext = 1;
+        } else if (nfree == 0) {
+                want_ext = 1;
+        }
+        /* try the new extent first.  only a lack of disk space sends
+         * us on to the already backed inodes; any other outcome
+         * (success included) is returned as is.
+         */
+        if (want_ext) {
+                err = diAllocExt(imap, agno, ip);
+                if (err != -ENOSPC)
+                        return err;
+        }
+        /* fall back to an existing free inode of the ag.
+         */
+        return diAllocIno(imap, agno, ip);
+}
+/*
+ * NAME:        diAllocAny(imap,agno,dir,ip)
+ *
+ * FUNCTION:    allocate a disk inode from any other allocation group.
+ *
+ *              this routine is called when an allocation attempt within
+ *              the primary allocation group has failed. it attempts to
+ *              allocate an inode from any allocation group other than the
+ *              specified primary group.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      agno    - primary allocation group (to avoid).
+ *      dir     - TRUE if the new disk inode is for a directory.
+ *      ip      - pointer to a new inode to be filled in on successful return
+ *                with the disk inode number allocated, its extent address
+ *                and the start of the ag.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+static int
+diAllocAny(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
+{
+        int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
+        int cur, err;
+        /* first sweep: the ags after agno, up to and including
+         * the maximum ag number.  each ag is locked only for the
+         * duration of its own attempt.
+         */
+        cur = agno + 1;
+        while (cur <= maxag) {
+                AG_LOCK(imap, cur);
+                err = diAllocAG(imap, cur, dir, ip);
+                AG_UNLOCK(imap, cur);
+                if (err != -ENOSPC)
+                        return err;
+                cur++;
+        }
+        /* second sweep: the ags in front of agno.
+         */
+        cur = 0;
+        while (cur < agno) {
+                AG_LOCK(imap, cur);
+                err = diAllocAG(imap, cur, dir, ip);
+                AG_UNLOCK(imap, cur);
+                if (err != -ENOSPC)
+                        return err;
+                cur++;
+        }
+        /* every other ag reported no space: no free disk inodes.
+         */
+        return -ENOSPC;
+}
+/*
+ * NAME:        diAllocIno(imap,agno,ip)
+ *
+ * FUNCTION:    allocate a disk inode from the allocation group's free
+ *              inode list, returning an error if this free list is
+ *              empty (i.e. no iags on the list).
+ *
+ *              allocation occurs from the first iag on the list using
+ *              the iag's free inode summary map to find the leftmost
+ *              free inode in the iag. 
+ *              
+ * PRE CONDITION: Already have AG lock for this AG.
+ *              
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      agno    - allocation group.
+ *      ip      - pointer to new inode to be filled in on successful return
+ *                with the disk inode number allocated, its extent address
+ *                and the start of the ag.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
+{
+        struct inode *ipimap = imap->im_ipimap;
+        struct metapage *pg;
+        struct iag *cur_iag;
+        int head, word, bit, ext, inum, err;
+        /* an empty ag free inode list means there is nothing to
+         * allocate from here.
+         */
+        head = imap->im_agctl[agno].inofree;
+        if (head < 0)
+                return -ENOSPC;
+        /* take the read lock on the imap inode, then read the iag
+         * at the head of the list.
+         */
+        IREAD_LOCK(ipimap);
+        err = diIAGRead(imap, head, &pg);
+        if (err) {
+                IREAD_UNLOCK(ipimap);
+                return err;
+        }
+        cur_iag = (struct iag *) pg->data;
+        /* an iag on the free inode list must have free inodes.
+         */
+        if (!cur_iag->nfreeinos) {
+                IREAD_UNLOCK(ipimap);
+                release_metapage(pg);
+                jfs_error(ip->i_sb,
+                          "diAllocIno: nfreeinos = 0, but iag on freelist");
+                return -EIO;
+        }
+        /* walk the free inode summary map to the first word that
+         * has a zero (free) bit.
+         */
+        word = 0;
+        while (word < SMAPSZ && ~cur_iag->inosmap[word] == 0)
+                word++;
+        if (word >= SMAPSZ) {
+                IREAD_UNLOCK(ipimap);
+                release_metapage(pg);
+                jfs_error(ip->i_sb,
+                          "diAllocIno: free inode not found in summary map");
+                return -EIO;
+        }
+        /* locate the extent with free inodes inside that word.
+         */
+        bit = diFindFree(le32_to_cpu(cur_iag->inosmap[word]), 0);
+        if (bit >= EXTSPERSUM) {
+                IREAD_UNLOCK(ipimap);
+                release_metapage(pg);
+                jfs_error(ip->i_sb, "diAllocIno: no free extent found");
+                return -EIO;
+        }
+        ext = (word << L2EXTSPERSUM) + bit;
+        /* locate the first free inode inside that extent.
+         */
+        bit = diFindFree(le32_to_cpu(cur_iag->wmap[ext]), 0);
+        if (bit >= INOSPEREXT) {
+                IREAD_UNLOCK(ipimap);
+                release_metapage(pg);
+                jfs_error(ip->i_sb, "diAllocIno: free inode not found");
+                return -EIO;
+        }
+        /* inode number within the iag.
+         */
+        inum = (ext << L2INOSPEREXT) + bit;
+        /* allocate it in the map; the imap read lock is dropped
+         * whether or not the allocation worked.
+         */
+        err = diAllocBit(imap, cur_iag, inum);
+        IREAD_UNLOCK(ipimap);
+        if (err) {
+                release_metapage(pg);
+                return err;
+        }
+        /* record the result in the new inode and write the iag.
+         */
+        diInitInode(ip, head, inum, ext, cur_iag);
+        write_metapage(pg);
+        return 0;
+}
+/*
+ * NAME:        diAllocExt(imap,agno,ip)
+ *
+ * FUNCTION:    add a new extent of free inodes to an iag, allocating
+ *              an inode from this extent to satisfy the current allocation
+ *              request.
+ *              
+ *              this routine first tries to find an existing iag with free
+ *              extents through the ag free extent list.  if list is not
+ *              empty, the head of the list will be selected as the home
+ *              of the new extent of free inodes.  otherwise (the list is
+ *              empty), a new iag will be allocated for the ag to contain
+ *              the extent.
+ *              
+ *              once an iag has been selected, the free extent summary map
+ *              is used to locate a free extent within the iag and diNewExt()
+ *              is called to initialize the extent, with initialization
+ *              including the allocation of the first inode of the extent
+ *              for the purpose of satisfying this request.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      agno    - allocation group number.
+ *      ip      - pointer to new inode to be filled in on successful return
+ *                with the disk inode number allocated, its extent address
+ *                and the start of the ag.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
+{
+        struct inode *ipimap = imap->im_ipimap;
+        struct metapage *pg;
+        struct iag *cur_iag;
+        int iagno, word, bit, ext, err;
+        /* pick the iag that will hold the new extent: the head of the
+         * ag free extent list if there is one, otherwise a newly
+         * allocated iag.
+         */
+        iagno = imap->im_agctl[agno].extfree;
+        if (iagno >= 0) {
+                /* read the existing iag under the imap read lock.
+                 */
+                IREAD_LOCK(ipimap);
+                err = diIAGRead(imap, iagno, &pg);
+                if (err) {
+                        IREAD_UNLOCK(ipimap);
+                        jfs_error(ip->i_sb, "diAllocExt: error reading iag");
+                        return err;
+                }
+                cur_iag = (struct iag *) pg->data;
+        } else {
+                /* If successful, diNewIAG will obtain the read lock on
+                 * the imap inode.
+                 */
+                err = diNewIAG(imap, &iagno, agno, &pg);
+                if (err)
+                        return err;
+                cur_iag = (struct iag *) pg->data;
+                /* a brand new iag still needs its ag start block set.
+                 */
+                cur_iag->agstart = cpu_to_le64(AGTOBLK(agno, ipimap));
+        }
+        /* walk the free extent summary map to the first word that
+         * has a zero (free) bit.
+         */
+        word = 0;
+        while (word < SMAPSZ && ~cur_iag->extsmap[word] == 0)
+                word++;
+        if (word >= SMAPSZ) {
+                release_metapage(pg);
+                IREAD_UNLOCK(ipimap);
+                jfs_error(ip->i_sb,
+                          "diAllocExt: free ext summary map not found");
+                return -EIO;
+        }
+        /* turn the free bit into an extent number.
+         */
+        bit = diFindFree(le32_to_cpu(cur_iag->extsmap[word]), 0);
+        if (bit >= EXTSPERSUM) {
+                release_metapage(pg);
+                IREAD_UNLOCK(ipimap);
+                jfs_error(ip->i_sb, "diAllocExt: free extent not found");
+                return -EIO;
+        }
+        ext = (word << L2EXTSPERSUM) + bit;
+        /* initialize the new extent; the imap read lock is dropped
+         * whatever the outcome.
+         */
+        err = diNewExt(imap, cur_iag, ext);
+        IREAD_UNLOCK(ipimap);
+        if (!err) {
+                /* record the result in the new inode and write the iag.
+                 */
+                diInitInode(ip, iagno, ext << L2INOSPEREXT, ext, cur_iag);
+                write_metapage(pg);
+                return 0;
+        }
+        /* something bad happened.  an iag whose extents are all still
+         * free is treated as the newly allocated one and is put back
+         * on the inode map's iag free list.
+         */
+        if (cur_iag->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+                IAGFREE_LOCK(imap);
+                cur_iag->iagfree = cpu_to_le32(imap->im_freeiag);
+                imap->im_freeiag = iagno;
+                IAGFREE_UNLOCK(imap);
+        }
+        write_metapage(pg);
+        return err;
+}
+/*
+ * NAME:        diAllocBit(imap,iagp,ino)
+ *
+ * FUNCTION:    allocate a backed inode from an iag.
+ *
+ *              this routine performs the mechanics of allocating a
+ *              specified inode from a backed extent.
+ *
+ *              if the inode to be allocated represents the last free
+ *              inode within the iag, the iag will be removed from the
+ *              ag free inode list.
+ *
+ *              a careful update approach is used to provide consistency
+ *              in the face of updates to multiple buffers.  under this
+ *              approach, all required buffers are obtained before making
+ *              any updates and are held until all updates are complete.
+ *              
+ * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
+ *      this AG.  Must have read lock on imap inode.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      iagp    - pointer to iag. 
+ *      ino     - inode number to be allocated within the iag.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
+{
+        struct metapage *fwd_mp = NULL, *back_mp = NULL;
+        struct iag *fwd_iag = NULL, *back_iag = NULL;
+        int last_free, fwd, back, agno, extno, bitno, sword, rc;
+        u32 mask;
+        /* taking the last free inode of the iag means the iag has to
+         * leave the ag free inode list, so fetch its list neighbours
+         * (where they exist) before anything is modified.
+         */
+        last_free = (iagp->nfreeinos == cpu_to_le32(1));
+        if (last_free) {
+                fwd = (int) le32_to_cpu(iagp->inofreefwd);
+                if (fwd >= 0) {
+                        rc = diIAGRead(imap, fwd, &fwd_mp);
+                        if (rc)
+                                return rc;
+                        fwd_iag = (struct iag *) fwd_mp->data;
+                }
+                back = (int) le32_to_cpu(iagp->inofreeback);
+                if (back >= 0) {
+                        rc = diIAGRead(imap, back, &back_mp);
+                        if (rc) {
+                                if (fwd_mp)
+                                        release_metapage(fwd_mp);
+                                return rc;
+                        }
+                        back_iag = (struct iag *) back_mp->data;
+                }
+        }
+        /* ag number of the iag, and the extent / bit position of
+         * the inode within the iag.
+         */
+        agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
+        extno = ino >> L2INOSPEREXT;
+        bitno = ino & (INOSPEREXT - 1);
+        /* map bit for this inode.
+         */
+        mask = HIGHORDER >> bitno;
+        /* the inode must be free in both the persistent and the
+         * working map, and its extent must be backed.
+         */
+        if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0 ||
+            (le32_to_cpu(iagp->wmap[extno]) & mask) != 0 ||
+            addressPXD(&iagp->inoext[extno]) == 0) {
+                if (fwd_mp)
+                        release_metapage(fwd_mp);
+                if (back_mp)
+                        release_metapage(back_mp);
+                jfs_error(imap->im_ipimap->i_sb,
+                          "diAllocBit: iag inconsistent");
+                return -EIO;
+        }
+        /* mark the inode allocated in the working map.
+         */
+        iagp->wmap[extno] |= cpu_to_le32(mask);
+        /* a fully allocated extent is flagged in the free inode
+         * summary map.
+         */
+        if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+                sword = extno >> L2EXTSPERSUM;
+                bitno = extno & (EXTSPERSUM - 1);
+                iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
+        }
+        /* unlink the iag from the ag free inode list if that was its
+         * last free inode.  with no backward neighbour the iag was
+         * the list head, so the ag control entry is updated instead.
+         */
+        if (last_free) {
+                if (fwd_mp) {
+                        fwd_iag->inofreeback = iagp->inofreeback;
+                        write_metapage(fwd_mp);
+                }
+                if (back_mp) {
+                        back_iag->inofreefwd = iagp->inofreefwd;
+                        write_metapage(back_mp);
+                } else {
+                        imap->im_agctl[agno].inofree =
+                            le32_to_cpu(iagp->inofreefwd);
+                }
+                iagp->inofreeback = cpu_to_le32(-1);
+                iagp->inofreefwd = iagp->inofreeback;
+        }
+        /* one less free inode at the iag, ag and inode map levels.
+         */
+        iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
+        imap->im_agctl[agno].numfree -= 1;
+        atomic_dec(&imap->im_numfree);
+        return 0;
+}
+/*
+ * NAME:        diNewExt(imap,iagp,extno)
+ *
+ * FUNCTION:    initialize a new extent of inodes for an iag, allocating
+ *              the first inode of the extent for use for the current
+ *              allocation request.
+ *
+ *              disk resources are allocated for the new extent of inodes
+ *              and the inodes themselves are initialized to reflect their
+ *              existence within the extent (i.e. their inode numbers and
+ *              inode extent addresses are set) and their initial state
+ *              (mode and link count are set to zero).
+ *
+ *              if the iag is new, it is not yet on an ag extent free list
+ *              but will now be placed on this list.
+ *
+ *              if the allocation of the new extent causes the iag to
+ *              have no free extent, the iag will be removed from the
+ *              ag extent free list.
+ *
+ *              if the iag has no free backed inodes, it will be placed
+ *              on the ag free inode list, since the addition of the new
+ *              extent will now cause it to have free inodes.
+ *
+ *              a careful update approach is used to provide consistency
+ *              (i.e. list consistency) in the face of updates to multiple
+ *              buffers.  under this approach, all required buffers are
+ *              obtained before making any updates and are held until all
+ *              updates are complete.
+ *              
+ * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
+ *      this AG.  Must have read lock on imap inode.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      iagp    - pointer to iag. 
+ *      extno   - extent number.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ */
+static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
+{
+        /* buffers held across the update:
+         *   amp/aiagp - forward neighbour of iagp on the ag free extent
+         *               list, or the current list head when iagp is
+         *               about to be added to that list.
+         *   bmp/biagp - back neighbour of iagp on the ag free extent list.
+         *   cmp/ciagp - head of the ag free inode list; cmp is only set
+         *               when that iag is not already held via amp or bmp.
+         *   dmp       - page of disk inodes currently being initialized.
+         */
+        int agno, iagno, fwd, back, freei = 0, sword, rc;
+        struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL;
+        struct metapage *amp, *bmp, *cmp, *dmp;
+        struct inode *ipimap;
+        s64 blkno, hint;
+        int i, j;
+        u32 mask;
+        ino_t ino;
+        struct dinode *dp;
+        struct jfs_sb_info *sbi;
+        /* better have free extents.
+         */
+        if (!iagp->nfreeexts) {
+                jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents");
+                return -EIO;
+        }
+        /* get the inode map inode.
+         */
+        ipimap = imap->im_ipimap;
+        sbi = JFS_SBI(ipimap->i_sb);
+        amp = bmp = cmp = NULL;
+        /* get the ag and iag numbers for this iag.
+         */
+        agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+        /* agstart is read from the on-disk iag.  a damaged value would
+         * make agno index outside imap->im_agctl[] below, so reject it
+         * here, before any other buffer has been obtained.
+         */
+        if (agno < 0 || agno >= MAXAG) {
+                jfs_error(imap->im_ipimap->i_sb,
+                          "diNewExt: invalid ag number");
+                return -EIO;
+        }
+        iagno = le32_to_cpu(iagp->iagnum);
+        /* check if this is the last free extent within the
+         * iag.  if so, the iag must be removed from the ag
+         * free extent list, so get the iags preceding and
+         * following the iag on this list.
+         */
+        if (iagp->nfreeexts == cpu_to_le32(1)) {
+                if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+                        /* nothing else is held yet, so a plain return
+                         * is enough on failure.
+                         */
+                        if ((rc = diIAGRead(imap, fwd, &amp)))
+                                return (rc);
+                        aiagp = (struct iag *) amp->data;
+                }
+                if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+                        if ((rc = diIAGRead(imap, back, &bmp)))
+                                goto error_out;
+                        biagp = (struct iag *) bmp->data;
+                }
+        } else {
+                /* the iag has free extents.  if all extents are free
+                 * (as is the case for a newly allocated iag), the iag
+                 * must be added to the ag free extent list, so get
+                 * the iag at the head of the list in preparation for
+                 * adding this iag to this list.
+                 */
+                fwd = back = -1;
+                if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+                        if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+                                if ((rc = diIAGRead(imap, fwd, &amp)))
+                                        goto error_out;
+                                aiagp = (struct iag *) amp->data;
+                        }
+                }
+        }
+        /* check if the iag has no free inodes.  if so, the iag
+         * will have to be added to the ag free inode list, so get
+         * the iag at the head of the list in preparation for
+         * adding this iag to this list.  in doing this, we must
+         * check if we already have the iag at the head of
+         * the list in hand.
+         */
+        if (iagp->nfreeinos == 0) {
+                freei = imap->im_agctl[agno].inofree;
+                if (freei >= 0) {
+                        if (freei == fwd) {
+                                ciagp = aiagp;
+                        } else if (freei == back) {
+                                ciagp = biagp;
+                        } else {
+                                if ((rc = diIAGRead(imap, freei, &cmp)))
+                                        goto error_out;
+                                ciagp = (struct iag *) cmp->data;
+                        }
+                        if (ciagp == NULL) {
+                                jfs_error(imap->im_ipimap->i_sb,
+                                          "diNewExt: ciagp == NULL");
+                                rc = -EIO;
+                                goto error_out;
+                        }
+                }
+        }
+        /* allocate disk space for the inode extent.  the allocation
+         * hint is the last block of the previous extent of this iag
+         * when there is one with an address; otherwise it is derived
+         * from the ag number.
+         */
+        if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
+                hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
+        else
+                hint = addressPXD(&iagp->inoext[extno - 1]) +
+                    lengthPXD(&iagp->inoext[extno - 1]) - 1;
+        if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
+                goto error_out;
+        /* compute the inode number of the first inode within the
+         * extent.
+         */
+        ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);
+        /* initialize the inodes within the newly allocated extent a
+         * page at a time.
+         */
+        for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
+                /* get a buffer for this page of disk inodes.
+                 */
+                dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
+                if (dmp == NULL) {
+                        /* NOTE(review): the blocks obtained from dbAlloc()
+                         * above are not given back on this path.
+                         */
+                        rc = -EIO;
+                        goto error_out;
+                }
+                dp = (struct dinode *) dmp->data;
+                /* initialize the inode number, mode, link count and
+                 * inode extent address.
+                 */
+                for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
+                        dp->di_inostamp = cpu_to_le32(sbi->inostamp);
+                        dp->di_number = cpu_to_le32(ino);
+                        dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
+                        dp->di_mode = 0;
+                        dp->di_nlink = 0;
+                        PXDaddress(&(dp->di_ixpxd), blkno);
+                        PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
+                }
+                write_metapage(dmp);
+        }
+        /* from here on no step can fail: every buffer that has to be
+         * changed is already in hand.
+         */
+        /* if this is the last free extent within the iag, remove the
+         * iag from the ag free extent list.
+         */
+        if (iagp->nfreeexts == cpu_to_le32(1)) {
+                if (fwd >= 0)
+                        aiagp->extfreeback = iagp->extfreeback;
+                if (back >= 0)
+                        biagp->extfreefwd = iagp->extfreefwd;
+                else
+                        imap->im_agctl[agno].extfree =
+                            le32_to_cpu(iagp->extfreefwd);
+                iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
+        } else {
+                /* if the iag has all free extents (newly allocated iag),
+                 * add the iag to the ag free extent list.
+                 */
+                if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+                        if (fwd >= 0)
+                                aiagp->extfreeback = cpu_to_le32(iagno);
+                        iagp->extfreefwd = cpu_to_le32(fwd);
+                        iagp->extfreeback = cpu_to_le32(-1);
+                        imap->im_agctl[agno].extfree = iagno;
+                }
+        }
+        /* if the iag has no free inodes, add the iag to the
+         * ag free inode list.
+         */
+        if (iagp->nfreeinos == 0) {
+                if (freei >= 0)
+                        ciagp->inofreeback = cpu_to_le32(iagno);
+                iagp->inofreefwd =
+                    cpu_to_le32(imap->im_agctl[agno].inofree);
+                iagp->inofreeback = cpu_to_le32(-1);
+                imap->im_agctl[agno].inofree = iagno;
+        }
+        /* initialize the extent descriptor of the extent. */
+        PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
+        PXDaddress(&iagp->inoext[extno], blkno);
+        /* initialize the working and persistent map of the extent.
+         * the working map will be initialized such that
+         * it indicates the first inode of the extent is allocated.
+         */
+        iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
+        iagp->pmap[extno] = 0;
+        /* update the free inode and free extent summary maps
+         * for the extent to indicate the extent has free inodes
+         * and no longer represents a free extent.
+         */
+        sword = extno >> L2EXTSPERSUM;
+        mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
+        iagp->extsmap[sword] |= cpu_to_le32(mask);
+        iagp->inosmap[sword] &= cpu_to_le32(~mask);
+        /* update the free inode and free extent counts for the
+         * iag.  one inode of the new extent is already accounted as
+         * allocated (see wmap above), hence INOSPEREXT - 1 free.
+         */
+        iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
+                                      (INOSPEREXT - 1));
+        iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
+        /* update the free and backed inode counts for the ag.
+         */
+        imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
+        imap->im_agctl[agno].numinos += INOSPEREXT;
+        /* update the free and backed inode counts for the inode map.
+         */
+        atomic_add(INOSPEREXT - 1, &imap->im_numfree);
+        atomic_add(INOSPEREXT, &imap->im_numinos);
+        /* write the iags.
+         */
+        if (amp)
+                write_metapage(amp);
+        if (bmp)
+                write_metapage(bmp);
+        if (cmp)
+                write_metapage(cmp);
+        return (0);
+      error_out:
+        /* release the iags without writing them; none has been
+         * modified on any path that reaches this label.
+         */
+        if (amp)
+                release_metapage(amp);
+        if (bmp)
+                release_metapage(bmp);
+        if (cmp)
+                release_metapage(cmp);
+        return (rc);
+}
+/*
+ * NAME:        diNewIAG(imap,iagnop,agno,mpp)
+ *
+ * FUNCTION:    allocate a new iag for an allocation group.
+ *
+ *              first tries to allocate the iag from the inode map
+ *              iagfree list:
+ *              if the list has free iags, the head of the list is removed
+ *              and returned to satisfy the request.
+ *              if the inode map's iag free list is empty, the inode map
+ *              is extended to hold a new iag. this new iag is initialized
+ *              and returned to satisfy the request.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      iagnop  - pointer to an iag number set with the number of the
+ *                newly allocated iag upon successful return.
+ *      agno    - allocation group number (not referenced by the body).
+ *      mpp     - pointer to a metapage pointer, set to the new iag's
+ *                buffer upon successful return.
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -ENOSPC - insufficient disk resources.
+ *      -EIO    - i/o error.
+ *
+ * serialization:
+ *      AG lock held on entry/exit;
+ *      the iag free lock is taken on entry and dropped before returning;
+ *      write lock on the map is held inside while the map is extended;
+ *      read lock on the map is held on successful completion;
+ *
+ * note: new iag transaction:
+ * . synchronously write iag;
+ * . write log of xtree and inode  of imap;
+ * . commit;
+ * . synchronous write of xtree (right to left, bottom to top);
+ * . at start of logredo(): init in-memory imap with one additional iag page;
+ * . at end of logredo(): re-read imap inode to determine
+ *   new imap size;
+ */
+static int
+diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
+{
+        int rc;
+        int iagno, i, xlen;
+        struct inode *ipimap;
+        struct super_block *sb;
+        struct jfs_sb_info *sbi;
+        struct metapage *mp;
+        struct iag *iagp;
+        s64 xaddr = 0;
+        s64 blkno;
+        tid_t tid;
+#ifdef _STILL_TO_PORT
+        xad_t xad;
+#endif                          /*  _STILL_TO_PORT */
+        struct inode *iplist[1];
+        /* pick up pointers to the inode map and mount inodes */
+        ipimap = imap->im_ipimap;
+        sb = ipimap->i_sb;
+        sbi = JFS_SBI(sb);
+        /* acquire the free iag lock; it is released at the 'out' label
+         * (or explicitly on the one early return below).
+         */
+        IAGFREE_LOCK(imap);
+        /* if there are any iags on the inode map free iag list,
+         * allocate the iag from the head of the list.
+         */
+        if (imap->im_freeiag >= 0) {
+                /* pick up the iag number at the head of the list */
+                iagno = imap->im_freeiag;
+                /* determine the logical block number of the iag */
+                blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
+        } else {
+                /* no free iags. the inode map will have to be extended
+                 * to include a new iag.
+                 */
+                /* acquire inode map lock */
+                IWRITE_LOCK(ipimap);
+                /* sanity check: the map size in pages must equal the
+                 * next iag number plus one.
+                 */
+                if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
+                        IWRITE_UNLOCK(ipimap);
+                        IAGFREE_UNLOCK(imap);
+                        jfs_error(imap->im_ipimap->i_sb,
+                                  "diNewIAG: ipimap->i_size is wrong");
+                        return -EIO;
+                }
+                /* get the next available iag number */
+                iagno = imap->im_nextiag;
+                /* make sure that we have not exceeded the maximum inode
+                 * number limit.
+                 */
+                if (iagno > (MAXIAGS - 1)) {
+                        /* release the inode map lock */
+                        IWRITE_UNLOCK(ipimap);
+                        rc = -ENOSPC;
+                        goto out;
+                }
+                /*
+                 * synchronously append new iag page.
+                 */
+                /* determine the logical address of iag page to append */
+                blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
+                /* Allocate extent for new iag page */
+                xlen = sbi->nbperpage;
+                if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
+                        /* release the inode map lock */
+                        IWRITE_UNLOCK(ipimap);
+                        goto out;
+                }
+                /* assign a buffer for the page */
+                mp = get_metapage(ipimap, xaddr, PSIZE, 1);
+                if (!mp) {
+                        /* Free the blocks allocated for the iag since it was
+                         * not successfully added to the inode map
+                         */
+                        dbFree(ipimap, xaddr, (s64) xlen);
+                        /* release the inode map lock */
+                        IWRITE_UNLOCK(ipimap);
+                        rc = -EIO;
+                        goto out;
+                }
+                iagp = (struct iag *) mp->data;
+                /* init the iag: not on any list, no backed inodes,
+                 * every extent free.
+                 */
+                memset(iagp, 0, sizeof(struct iag));
+                iagp->iagnum = cpu_to_le32(iagno);
+                iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
+                iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
+                iagp->iagfree = cpu_to_le32(-1);
+                iagp->nfreeinos = 0;
+                iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
+                /* initialize the free inode summary map (free extent
+                 * summary map initialization handled by the memset above).
+                 */
+                for (i = 0; i < SMAPSZ; i++)
+                        iagp->inosmap[i] = cpu_to_le32(ONES);
+                /*
+                 * Invalidate the page after writing and syncing it.
+                 * After it's initialized, we access it in a different
+                 * address space
+                 */
+                set_bit(META_discard, &mp->flag);
+                flush_metapage(mp);
+                /*
+                 * start transaction of update of the inode map
+                 * addressing structure pointing to the new iag page;
+                 */
+                tid = txBegin(sb, COMMIT_FORCE);
+                down(&JFS_IP(ipimap)->commit_sem);
+                /* update the inode map addressing structure to point to it */
+                if ((rc =
+                     xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
+                        txEnd(tid);
+                        up(&JFS_IP(ipimap)->commit_sem);
+                        /* Free the blocks allocated for the iag since it was
+                         * not successfully added to the inode map
+                         */
+                        dbFree(ipimap, xaddr, (s64) xlen);
+                        /* release the inode map lock */
+                        IWRITE_UNLOCK(ipimap);
+                        goto out;
+                }
+                /* update the inode map's inode to reflect the extension */
+                ipimap->i_size += PSIZE;
+                inode_add_bytes(ipimap, PSIZE);
+                /*
+                 * txCommit(COMMIT_FORCE) will synchronously write address
+                 * index pages and inode after commit in careful update order
+                 * of address index pages (right to left, bottom up);
+                 */
+                iplist[0] = ipimap;
+                /* NOTE(review): this rc is not examined; it is overwritten
+                 * by the diIAGRead() call further down.
+                 */
+                rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+                txEnd(tid);
+                up(&JFS_IP(ipimap)->commit_sem);
+                /* mirror the same extent into the other copy of the map
+                 * (see duplicateIXtree()).
+                 */
+                duplicateIXtree(sb, blkno, xlen, &xaddr);
+                /* update the next available iag number */
+                imap->im_nextiag += 1;
+                /* Add the iag to the iag free list so we don't lose the iag
+                 * if a failure happens now.
+                 */
+                imap->im_freeiag = iagno;
+                /* Until we have logredo working, we want the imap inode &
+                 * control page to be up to date.
+                 */
+                diSync(ipimap);
+                /* release the inode map lock */
+                IWRITE_UNLOCK(ipimap);
+        }
+        /* obtain read lock on map; it stays held when we return 0 */
+        IREAD_LOCK(ipimap);
+        /* read the iag */
+        if ((rc = diIAGRead(imap, iagno, &mp))) {
+                IREAD_UNLOCK(ipimap);
+                rc = -EIO;
+                goto out;
+        }
+        iagp = (struct iag *) mp->data;
+        /* remove the iag from the iag free list */
+        imap->im_freeiag = le32_to_cpu(iagp->iagfree);
+        iagp->iagfree = cpu_to_le32(-1);
+        /* set the return iag number and buffer pointer */
+        *iagnop = iagno;
+        *mpp = mp;
+      out:
+        /* release the iag free lock */
+        IAGFREE_UNLOCK(imap);
+        return (rc);
+}
+/*
+ * NAME:        diIAGRead()
+ *
+ * FUNCTION:    read the buffer holding the specified iag of a fileset
+ *              or aggregate inode map.
+ *
+ * PARAMETERS:
+ *      imap    - pointer to inode map control structure.
+ *      iagno   - iag number.
+ *      mpp     - points to the metapage pointer that receives the
+ *                buffer; it is set to NULL when the read fails.
+ *
+ * SERIALIZATION:
+ *      must have read lock on imap inode
+ *      (When called by diExtendFS, the filesystem is quiesced, therefore
+ *       the read lock is unnecessary.)
+ *
+ * RETURN VALUES:
+ *      0       - success.
+ *      -EIO    - i/o error.
+ */
+static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
+{
+        struct inode *ipimap = imap->im_ipimap;
+        struct jfs_sb_info *sbi = JFS_SBI(ipimap->i_sb);
+        struct metapage *iag_mp;
+        s64 lblk;
+        /* logical block number of the iag page */
+        lblk = IAGTOLBLK(iagno, sbi->l2nbperpage);
+        /* read it and hand the result (possibly NULL) to the caller */
+        iag_mp = read_metapage(ipimap, lblk, PSIZE, 0);
+        *mpp = iag_mp;
+        return iag_mp ? 0 : -EIO;
+}
+/*
+ * NAME:        diFindFree()
+ *
+ * FUNCTION:    locate the first clear (free) bit of a word, scanning
+ *              from a given bit position toward the low-order end.
+ *              bit position 0 is the high-order bit of the word.
+ *
+ * PARAMETERS:
+ *      word    - word to be examined.
+ *      start   - bit position at which the scan begins; must be
+ *                less than 32.
+ *
+ * RETURN VALUES:
+ *      bit position of the first free bit at or after 'start', or 32
+ *      if every bit from 'start' on is set.
+ */
+static int diFindFree(u32 word, int start)
+{
+        u32 probe;
+        int pos = start;
+        assert(start < 32);
+        /* slide a one-bit mask down the word instead of shifting the
+         * word itself; stop at the first position whose bit is clear.
+         */
+        for (probe = HIGHORDER >> start; pos < 32; pos++, probe >>= 1) {
+                if (!(word & probe))
+                        return pos;
+        }
+        return 32;
+}
+/*
+ * NAME:        diUpdatePMap()
+ *
+ * FUNCTION: Set or clear the bit of one inode in the persistent map of
+ *      its IAG, then tie the IAG buffer to the transaction's position
+ *      in the log sync list.
+ *
+ * PRE CONDITIONS: Working map has already been updated for allocate.
+ *
+ * PARAMETERS:
+ *      ipimap  - Incore inode map inode
+ *      inum    - Number of inode to mark in permanent map
+ *      is_free - If TRUE indicates inode should be marked freed, otherwise
+ *                indicates inode should be marked allocated.
+ *      tblk    - Transaction block; its lsn, clsn and synclist are used
+ *                when updating the IAG buffer's log sync state.
+ *
+ * RETURN VALUES:
+ *      0       - success
+ *      -EIO    - the iag lies outside the map, or (allocate case only)
+ *                the working/persistent map bits are inconsistent
+ *      other   - error returned by diIAGRead()
+ */
+int
+diUpdatePMap(struct inode *ipimap,
+             unsigned long inum, boolean_t is_free, struct tblock * tblk)
+{
+        struct inomap *imap = JFS_IP(ipimap)->i_imap;
+        struct jfs_log *log;
+        struct metapage *mp;
+        struct iag *iagp;
+        int iagno, ino, extno, bitno;
+        int tlsn, tdiff, mdiff;
+        u32 mask;
+        int rc;
+        /* the iag holding the inode must lie within the map */
+        iagno = INOTOIAG(inum);
+        if (iagno >= imap->im_nextiag) {
+                jfs_error(ipimap->i_sb,
+                          "diUpdatePMap: the iag is outside the map");
+                return -EIO;
+        }
+        /* fetch the iag under the map read lock */
+        IREAD_LOCK(ipimap);
+        rc = diIAGRead(imap, iagno, &mp);
+        IREAD_UNLOCK(ipimap);
+        if (rc)
+                return rc;
+        iagp = (struct iag *) mp->data;
+        /* split the inode number into extent number within the iag
+         * and bit number within the extent.
+         */
+        ino = inum & (INOSPERIAG - 1);
+        extno = ino >> L2INOSPEREXT;
+        bitno = ino & (INOSPEREXT - 1);
+        mask = HIGHORDER >> bitno;
+        if (is_free != TRUE) {
+                /*
+                 * allocate in the persistent map: the inode has to be
+                 * allocated in the working map already and still free
+                 * in the persistent map, otherwise give up.
+                 */
+                if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+                        release_metapage(mp);
+                        jfs_error(ipimap->i_sb,
+                                  "diUpdatePMap: the inode is not allocated in "
+                                  "the working map");
+                        return -EIO;
+                }
+                if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
+                        release_metapage(mp);
+                        jfs_error(ipimap->i_sb,
+                                  "diUpdatePMap: the inode is not free in the "
+                                  "persistent map");
+                        return -EIO;
+                }
+                iagp->pmap[extno] |= cpu_to_le32(mask);
+        } else {
+                /*
+                 * free in the persistent map: the inode is expected to
+                 * be allocated in both maps (the working map bit is only
+                 * cleared when the last reference is released).  a
+                 * mismatch is reported but the bit is cleared anyway.
+                 */
+                if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+                        jfs_error(ipimap->i_sb,
+                                  "diUpdatePMap: inode %ld not marked as "
+                                  "allocated in wmap!", inum);
+                }
+                if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
+                        jfs_error(ipimap->i_sb,
+                                  "diUpdatePMap: inode %ld not marked as "
+                                  "allocated in pmap!", inum);
+                }
+                iagp->pmap[extno] &= cpu_to_le32(~mask);
+        }
+        /*
+         * bring the iag buffer's lsn/clsn in line with the transaction
+         */
+        tlsn = tblk->lsn;
+        log = JFS_SBI(tblk->sb)->log;
+        if (mp->lsn == 0) {
+                /* buffer not yet in the logsync list: adopt the
+                 * transaction's values and link it in after the tblock.
+                 */
+                mp->log = log;
+                mp->lsn = tlsn;
+                LOGSYNC_LOCK(log);
+                log->count++;
+                list_add(&mp->synclist, &tblk->synclist);
+                mp->clsn = tblk->clsn;
+                LOGSYNC_UNLOCK(log);
+        } else {
+                /* keep the older (smaller) of the two lsn values */
+                logdiff(tdiff, tlsn, log);
+                logdiff(mdiff, mp->lsn, log);
+                if (tdiff < mdiff) {
+                        mp->lsn = tlsn;
+                        /* reposition mp after the tblock in the list */
+                        LOGSYNC_LOCK(log);
+                        list_move(&mp->synclist, &tblk->synclist);
+                        LOGSYNC_UNLOCK(log);
+                }
+                /* keep the younger (larger) of the two clsn values */
+                LOGSYNC_LOCK(log);
+                assert(mp->clsn);
+                logdiff(tdiff, tblk->clsn, log);
+                logdiff(mdiff, mp->clsn, log);
+                if (tdiff > mdiff)
+                        mp->clsn = tblk->clsn;
+                LOGSYNC_UNLOCK(log);
+        }
+        write_metapage(mp);
+        return 0;
+}
+/*
+ *      diExtendFS()
+ *
+ * function: update imap for extendfs();
+ *      resets the per-AG control information and rebuilds, from the iag
+ *      pages of the map, the AG free inode lists, the AG free extent
+ *      lists and the per-AG inode counts.
+ *
+ * parameters:
+ *      ipimap  - inode map inode.
+ *      ipbmap  - block map inode; only used to reach the in-memory bmap
+ *                (for the AG size shift db_agl2size).
+ *
+ * return values:
+ *      0       - success.
+ *      -EIO    - an iag carries an unexpected iagnum or an agstart that
+ *                maps outside the AG table, or the rebuilt totals do not
+ *                match the counts kept in the imap.
+ *      other   - last error returned by diIAGRead(); iags that could
+ *                not be read are skipped.
+ *
+ * note: AG size has been increased s.t. each k old contiguous AGs are
+ * coalesced into a new AG;
+ */
+int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
+{
+        int rc, rcx = 0;
+        struct inomap *imap = JFS_IP(ipimap)->i_imap;
+        struct iag *iagp = NULL, *hiagp = NULL;
+        /* note: despite its name, mp is the block map, not a metapage */
+        struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap;
+        struct metapage *bp, *hbp;
+        int i, n, head;
+        int numinos, xnuminos = 0, xnumfree = 0;
+        s64 agstart;
+        jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d",
+                   imap->im_nextiag, atomic_read(&imap->im_numinos),
+                   atomic_read(&imap->im_numfree));
+        /*
+         *      reconstruct imap
+         *
+         * coalesce contiguous k (newAGSize/oldAGSize) AGs;
+         * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
+         * note: new AG size = old AG size * (2**x).
+         */
+        /* init per AG control information im_agctl[] */
+        for (i = 0; i < MAXAG; i++) {
+                imap->im_agctl[i].inofree = -1;
+                imap->im_agctl[i].extfree = -1;
+                imap->im_agctl[i].numinos = 0;  /* number of backed inodes */
+                imap->im_agctl[i].numfree = 0;  /* number of free backed inodes */
+        }
+        /*
+         *      process each iag page of the map.
+         *
+         * rebuild AG Free Inode List, AG Free Inode Extent List;
+         */
+        for (i = 0; i < imap->im_nextiag; i++) {
+                /* an unreadable iag is skipped; the error is remembered */
+                if ((rc = diIAGRead(imap, i, &bp))) {
+                        rcx = rc;
+                        continue;
+                }
+                iagp = (struct iag *) bp->data;
+                if (le32_to_cpu(iagp->iagnum) != i) {
+                        release_metapage(bp);
+                        jfs_error(ipimap->i_sb,
+                                  "diExtendFs: unexpected value of iagnum");
+                        return -EIO;
+                }
+                /* leave free iag in the free iag list */
+                if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
+                        release_metapage(bp);
+                        continue;
+                }
+                /* agstart that computes to the same ag is treated as same; */
+                agstart = le64_to_cpu(iagp->agstart);
+                /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */
+                n = agstart >> mp->db_agl2size;
+                /* agstart is read from disk; a damaged value must not be
+                 * allowed to index outside im_agctl[] below.
+                 */
+                if (n < 0 || n >= MAXAG) {
+                        release_metapage(bp);
+                        jfs_error(ipimap->i_sb,
+                                  "diExtendFs: invalid ag number");
+                        return -EIO;
+                }
+                /* compute backed inodes */
+                numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
+                    << L2INOSPEREXT;
+                if (numinos > 0) {
+                        /* merge AG backed inodes */
+                        imap->im_agctl[n].numinos += numinos;
+                        xnuminos += numinos;
+                }
+                /* if any backed free inodes, insert at AG free inode list */
+                if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
+                        if ((head = imap->im_agctl[n].inofree) == -1) {
+                                /* list is empty: this iag is its only member */
+                                iagp->inofreefwd = cpu_to_le32(-1);
+                                iagp->inofreeback = cpu_to_le32(-1);
+                        } else {
+                                /* link in front of the current head */
+                                if ((rc = diIAGRead(imap, head, &hbp))) {
+                                        rcx = rc;
+                                        goto nextiag;
+                                }
+                                hiagp = (struct iag *) hbp->data;
+                                hiagp->inofreeback = iagp->iagnum;
+                                iagp->inofreefwd = cpu_to_le32(head);
+                                iagp->inofreeback = cpu_to_le32(-1);
+                                write_metapage(hbp);
+                        }
+                        imap->im_agctl[n].inofree =
+                            le32_to_cpu(iagp->iagnum);
+                        /* merge AG backed free inodes */
+                        imap->im_agctl[n].numfree +=
+                            le32_to_cpu(iagp->nfreeinos);
+                        xnumfree += le32_to_cpu(iagp->nfreeinos);
+                }
+                /* if any free extents, insert at AG free extent list */
+                if (le32_to_cpu(iagp->nfreeexts) > 0) {
+                        if ((head = imap->im_agctl[n].extfree) == -1) {
+                                /* list is empty: this iag is its only member */
+                                iagp->extfreefwd = cpu_to_le32(-1);
+                                iagp->extfreeback = cpu_to_le32(-1);
+                        } else {
+                                /* link in front of the current head */
+                                if ((rc = diIAGRead(imap, head, &hbp))) {
+                                        rcx = rc;
+                                        goto nextiag;
+                                }
+                                hiagp = (struct iag *) hbp->data;
+                                hiagp->extfreeback = iagp->iagnum;
+                                iagp->extfreefwd = cpu_to_le32(head);
+                                iagp->extfreeback = cpu_to_le32(-1);
+                                write_metapage(hbp);
+                        }
+                        imap->im_agctl[n].extfree =
+                            le32_to_cpu(iagp->iagnum);
+                }
+              nextiag:
+                write_metapage(bp);
+        }
+        /* the totals rebuilt above must agree with the imap's counters */
+        if (xnuminos != atomic_read(&imap->im_numinos) ||
+            xnumfree != atomic_read(&imap->im_numfree)) {
+                jfs_error(ipimap->i_sb,
+                          "diExtendFs: numinos or numfree incorrect");
+                return -EIO;
+        }
+        return rcx;
+}
+/*
+ *      duplicateIXtree()
+ *
+ * function: insert the extent (blkno, xlen) -> *xaddr into the xtree of
+ *      the special inode returned by diReadSpecial(sb, FILESYSTEM_I, 1)
+ *      and grow that inode by one page, inside its own forced
+ *      transaction.  any failure sets JFS_BAD_SAIT in the mount flags
+ *      (and, when the inode cannot be read, in the on-disk superblock
+ *      as well) and nothing is reported to the caller.
+ *
+ * serialization: IWRITE_LOCK held on entry/exit
+ *
+ * note: shadow page with regular inode (rel.2);
+ */
+static void duplicateIXtree(struct super_block *sb, s64 blkno,
+                            int xlen, s64 *xaddr)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_superblock *disk_sb;
+        struct buffer_head *sb_bh;
+        struct inode *ip2;
+        tid_t tid;
+        /* already known bad (AIT2 ipmap2): leave it alone */
+        if (sbi->mntflag & JFS_BAD_SAIT)        /* s_flag */
+                return;
+        ip2 = diReadSpecial(sb, FILESYSTEM_I, 1);
+        if (ip2 == NULL) {
+                /* remember the failure in memory and, if the superblock
+                 * can be read, on disk too.
+                 */
+                sbi->mntflag |= JFS_BAD_SAIT;
+                if (!readSuper(sb, &sb_bh)) {
+                        disk_sb = (struct jfs_superblock *)sb_bh->b_data;
+                        disk_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
+                        mark_buffer_dirty(sb_bh);
+                        sync_dirty_buffer(sb_bh);
+                        brelse(sb_bh);
+                }
+                return;
+        }
+        /* start transaction */
+        tid = txBegin(sb, COMMIT_FORCE);
+        if (xtInsert(tid, ip2, 0, blkno, xlen, xaddr, 0)) {
+                /* could not add the extent: flag it and back out */
+                sbi->mntflag |= JFS_BAD_SAIT;
+                txAbort(tid, 1);
+        } else {
+                /* account for the added page, then commit */
+                ip2->i_size += PSIZE;
+                inode_add_bytes(ip2, PSIZE);
+                txCommit(tid, 1, &ip2, COMMIT_FORCE);
+        }
+        txEnd(tid);
+        diFreeSpecial(ip2);
+}
+/*
+ * NAME:        copy_from_dinode()
+ *
+ * FUNCTION:    Fill the in-memory inode (both the VFS part and the JFS
+ *              private part) from a disk inode, converting fields from
+ *              little-endian, and clear the fields that exist only
+ *              in memory.
+ *
+ * PARAMETERS:
+ *      dip     - disk inode to read from
+ *      ip      - in-memory inode to fill in
+ *
+ * RETURN VALUES:
+ *      0       - always
+ */
+static int copy_from_dinode(struct dinode * dip, struct inode *ip)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        u32 disk_mode = le32_to_cpu(dip->di_mode);
+        ji->fileset = le32_to_cpu(dip->di_fileset);
+        /* the full 32-bit mode is kept privately; the VFS inode only
+         * gets the low 16 bits.
+         */
+        ji->mode2 = disk_mode;
+        ip->i_mode = disk_mode & 0xffff;
+        ip->i_nlink = le32_to_cpu(dip->di_nlink);
+        ip->i_uid = le32_to_cpu(dip->di_uid);
+        ip->i_gid = le32_to_cpu(dip->di_gid);
+        ip->i_size = le64_to_cpu(dip->di_size);
+        /* timestamps */
+        ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
+        ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
+        ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
+        ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
+        ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
+        ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
+        /* block accounting and generation */
+        ip->i_blksize = ip->i_sb->s_blocksize;
+        ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
+        ip->i_generation = le32_to_cpu(dip->di_gen);
+        /* descriptors are copied as they are: in-memory pxd's and
+         * dxd's stay little-endian.
+         */
+        ji->ixpxd = dip->di_ixpxd;
+        ji->acl = dip->di_acl;
+        ji->ea = dip->di_ea;
+        ji->next_index = le32_to_cpu(dip->di_next_index);
+        ji->otime = le32_to_cpu(dip->di_otime.tv_sec);
+        ji->acltype = le32_to_cpu(dip->di_acltype);
+        /* device number, for device special files only */
+        if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) {
+                ji->dev = le32_to_cpu(dip->di_rdev);
+                ip->i_rdev = new_decode_dev(ji->dev);
+        }
+        /* type-dependent tail of the disk inode: directory table for
+         * directories, xtree root for regular files and symlinks,
+         * inline EA area for everything else.
+         */
+        if (S_ISDIR(ip->i_mode)) {
+                memcpy(&ji->i_dirtable, &dip->di_dirtable, 384);
+        } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
+                memcpy(&ji->i_xtroot, &dip->di_xtroot, 288);
+        } else {
+                memcpy(&ji->i_inline_ea, &dip->di_inlineea, 128);
+        }
+        /* reset the fields that have no on-disk counterpart */
+        ji->cflag = 0;
+        ji->btindex = 0;
+        ji->btorder = 0;
+        ji->bxflag = 0;
+        ji->blid = 0;
+        ji->atlhead = 0;
+        ji->atltail = 0;
+        ji->xtlid = 0;
+        return 0;
+}
+/*
+ * NAME:        copy_to_dinode()
+ *
+ * FUNCTION:    Stores the fields of an in-memory inode into a disk inode,
+ *              converting them to the little-endian on-disk format
+ *
+ * PARAMETERS:
+ *      dip     - disk inode to fill in
+ *      ip      - in-memory inode to copy from
+ */
+static void copy_to_dinode(struct dinode * dip, struct inode *ip)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        struct super_block *sb = ip->i_sb;
+        /* identity of the inode */
+        dip->di_fileset = cpu_to_le32(ji->fileset);
+        dip->di_inostamp = cpu_to_le32(JFS_SBI(sb)->inostamp);
+        dip->di_number = cpu_to_le32(ip->i_ino);
+        dip->di_gen = cpu_to_le32(ip->i_generation);
+        /* size and space usage */
+        dip->di_size = cpu_to_le64(ip->i_size);
+        dip->di_nblocks = cpu_to_le64(PBLK2LBLK(sb, ip->i_blocks));
+        /* links and ownership */
+        dip->di_nlink = cpu_to_le32(ip->i_nlink);
+        dip->di_uid = cpu_to_le32(ip->i_uid);
+        dip->di_gid = cpu_to_le32(ip->i_gid);
+        /*
+         * The low 16 mode bits are taken from i_mode; mode2 contributes
+         * only the high-order bits.
+         */
+        dip->di_mode = cpu_to_le32(ip->i_mode | (ji->mode2 & 0xffff0000));
+        /* timestamps */
+        dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
+        dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
+        dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
+        dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
+        dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
+        dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
+        /* only whole seconds of the creation time are kept in memory */
+        dip->di_otime.tv_sec = cpu_to_le32(ji->otime);
+        dip->di_otime.tv_nsec = 0;
+        /* descriptors are already little-endian in memory: plain copies */
+        dip->di_ixpxd = ji->ixpxd;
+        dip->di_acl = ji->acl;
+        dip->di_ea = ji->ea;
+        dip->di_next_index = cpu_to_le32(ji->next_index);
+        dip->di_acltype = cpu_to_le32(ji->acltype);
+        /* the device number is written for device nodes only */
+        switch (ip->i_mode & S_IFMT) {
+        case S_IFCHR:
+        case S_IFBLK:
+                dip->di_rdev = cpu_to_le32(ji->dev);
+                break;
+        default:
+                break;
+        }
+}
+#ifdef  _JFS_DEBUG_IMAP
+/*
+ *      DBGdiInit()
+ *
+ * Debug-only (_JFS_DEBUG_IMAP): allocates a zeroed 64KB bitmap and stores
+ * it in imap->im_DBGdimap, where DBGdiAlloc() and DBGdiFree() track one
+ * bit per inode number.
+ *
+ * Returns the bitmap that was installed.  An allocation failure trips
+ * assert(0).
+ */
+static void *DBGdiInit(struct inomap * imap)
+{
+        u32 *dimap;
+        int size;
+        size = 64 * 1024;
+        if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL)
+                assert(0);
+        bzero((void *) dimap, size);
+        imap->im_DBGdimap = dimap;
+        /* the function is declared to return a pointer, so return the map */
+        return dimap;
+}
+/*
+ *      DBGdiAlloc()
+ *
+ * Debug-only: sets the bit for ino in the imap's debug bitmap and logs a
+ * message when the bit was already set, i.e. on a duplicate allocation.
+ */
+static void DBGdiAlloc(struct inomap * imap, ino_t ino)
+{
+        u32 *dimap = imap->im_DBGdimap;
+        int w, b;
+        u32 m;
+        w = ino >> 5;           /* index of the 32-bit word holding the bit */
+        b = ino & 31;           /* bit position, counted from the MSB */
+        m = 0x80000000 >> b;
+        assert(w < 64 * 256);   /* 64 * 256 words is the 64KB set up by DBGdiInit() */
+        if (dimap[w] & m) {
+                /* ino_t need not be int-sized: widen it to match %lx exactly */
+                printk("DEBUG diAlloc: duplicate alloc ino:0x%lx\n",
+                       (unsigned long) ino);
+        }
+        dimap[w] |= m;
+}
+/*
+ *      DBGdiFree()
+ *
+ * Debug-only: clears the bit for ino in the imap's debug bitmap and logs a
+ * message when the bit was already clear, i.e. on a duplicate free.
+ */
+static void DBGdiFree(struct inomap * imap, ino_t ino)
+{
+        u32 *dimap = imap->im_DBGdimap;
+        int w, b;
+        u32 m;
+        w = ino >> 5;           /* index of the 32-bit word holding the bit */
+        b = ino & 31;           /* bit position, counted from the MSB */
+        m = 0x80000000 >> b;
+        assert(w < 64 * 256);   /* 64 * 256 words is the 64KB set up by DBGdiInit() */
+        if ((dimap[w] & m) == 0) {
+                /* ino_t need not be int-sized: widen it to match %lx exactly */
+                printk("DEBUG diFree: duplicate free ino:0x%lx\n",
+                       (unsigned long) ino);
+        }
+        dimap[w] &= ~m;
+}
+/*
+ * Debug-only: print the inode map control values (free/next IAG, the
+ * inode counters, and the control entry of AG 0), tagged with the caller's
+ * function name and line number.
+ */
+static void dump_cp(struct inomap * ipimap, char *function, int line)
+{
+        struct iagctl *ag0 = &ipimap->im_agctl[0];
+        printk("\n* ********* *\nControl Page %s %d\n", function, line);
+        printk("FreeIAG %d\tNextIAG %d\n",
+               ipimap->im_freeiag, ipimap->im_nextiag);
+        printk("NumInos %d\tNumFree %d\n",
+               atomic_read(&ipimap->im_numinos),
+               atomic_read(&ipimap->im_numfree));
+        printk("AG InoFree %d\tAG ExtFree %d\n", ag0->inofree, ag0->extfree);
+        printk("AG NumInos %d\tAG NumFree %d\n", ag0->numinos, ag0->numfree);
+}
+/*
+ * Debug-only: print the free-list links and free counters of one IAG
+ * (converted from little-endian), tagged with the caller's function name
+ * and line number.
+ */
+static void dump_iag(struct iag * iag, char *function, int line)
+{
+        u32 iagnum = le32_to_cpu(iag->iagnum);
+        u32 iagfree = le32_to_cpu(iag->iagfree);
+        u32 ino_fwd = le32_to_cpu(iag->inofreefwd);
+        u32 ino_back = le32_to_cpu(iag->inofreeback);
+        u32 ext_fwd = le32_to_cpu(iag->extfreefwd);
+        u32 ext_back = le32_to_cpu(iag->extfreeback);
+        u32 free_inos = le32_to_cpu(iag->nfreeinos);
+        u32 free_exts = le32_to_cpu(iag->nfreeexts);
+        printk("\n* ********* *\nIAG %s %d\n", function, line);
+        printk("IagNum %d\tIAG Free %d\n", iagnum, iagfree);
+        printk("InoFreeFwd %d\tInoFreeBack %d\n", ino_fwd, ino_back);
+        printk("ExtFreeFwd %d\tExtFreeBack %d\n", ext_fwd, ext_back);
+        printk("NFreeInos %d\tNFreeExts %d\n", free_inos, free_exts);
+}
+#endif                          /* _JFS_DEBUG_IMAP */
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
new file mode 100644
index 000000000000..6b59adec036a
--- /dev/null
+++ b/fs/jfs/jfs_imap.h
@@ -0,0 +1,175 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_IMAP
+#define _H_JFS_IMAP
+#include "jfs_txnmgr.h"
+/*
+ *      jfs_imap.h: disk inode manager
+ */
+#define EXTSPERIAG      128     /* number of disk inode extents per iag
+                                 * (= SMAPSZ * EXTSPERSUM)
+                                 */
+#define IMAPBLKNO       0       /* lblkno of dinomap within inode map   */
+#define SMAPSZ          4       /* number of 32-bit words per summary map */
+#define EXTSPERSUM      32      /* number of extents per summary map entry */
+#define L2EXTSPERSUM    5       /* log2 of EXTSPERSUM */
+#define PGSPERIEXT      4       /* number of 4K pages per dinode extent */
+#define MAXIAGS         ((1<<20)-1)     /* maximum number of iags       */
+#define MAXAG           128     /* maximum number of allocation groups  */
+#define AMAPSIZE      512       /* bytes in each IAG allocation map
+                                 * (EXTSPERIAG 32-bit words)
+                                 */
+#define SMAPSIZE      16        /* bytes in each IAG summary map
+                                 * (SMAPSZ 32-bit words)
+                                 */
+/* convert inode number to iag number (drops the low L2INOSPERIAG bits) */
+#define INOTOIAG(ino)   ((ino) >> L2INOSPERIAG)
+/* convert iag number to logical block number of the iag page:
+ * page number (iagno + 1) shifted by the log2 number of blocks per page.
+ * NOTE(review): the +1 presumably steps over the control page that sits
+ * at IMAPBLKNO (0) -- confirm.
+ */
+#define IAGTOLBLK(iagno,l2nbperpg)      (((iagno) + 1) << (l2nbperpg))
+/* get the starting block number of the 4K page of an inode extent
+ * that contains ino: the extent's start address (taken from pxd) plus the
+ * index of ino's page within the extent, converted from pages to blocks.
+ */
+#define INOPBLK(pxd,ino,l2nbperpg)      (addressPXD((pxd)) +            \
+        ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg)))
+/*
+ *      inode allocation map:
+ * 
+ * inode allocation map consists of 
+ * . the inode map control page and
+ * . inode allocation group pages (per 4096 inodes)
+ * which are addressed by standard JFS xtree.
+ */
+/*
+ *      inode allocation group page (per 4096 inodes of an AG)
+ *
+ * On-disk layout with little-endian fields.  Going by the per-field byte
+ * counts below: 72 bytes of control fields padded out to 2048, then the
+ * working map, the persistent map and the inode extent addresses, for
+ * 4096 bytes in all.  Field order and sizes must not change.
+ */
+struct iag {
+        __le64 agstart;         /* 8: starting block of ag              */
+        __le32 iagnum;          /* 4: inode allocation group number     */
+        __le32 inofreefwd;      /* 4: ag inode free list forward        */
+        __le32 inofreeback;     /* 4: ag inode free list back           */
+        __le32 extfreefwd;      /* 4: ag inode extent free list forward */
+        __le32 extfreeback;     /* 4: ag inode extent free list back    */
+        __le32 iagfree;         /* 4: iag free list                     */
+        /* summary map: 1 bit per inode extent */
+        __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes;
+                                 *      note: this indicates free and backed
+                                 *      inodes.  If the extent is not backed,
+                                 *      the value will be 1.  If the extent is
+                                 *      backed but all inodes are being used,
+                                 *      the value will be 1.  If the extent is
+                                 *      backed and at least one of the inodes
+                                 *      is free, the value will be 0.
+                                 */
+        __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */
+        __le32 nfreeinos;               /* 4: number of free inodes             */
+        __le32 nfreeexts;               /* 4: number of free extents            */
+        /* (72) */
+        u8 pad[1976];           /* 1976: pad to 2048 bytes */
+        /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
+        __le32 wmap[EXTSPERIAG];        /* 512: working allocation map  */
+        __le32 pmap[EXTSPERIAG];        /* 512: persistent allocation map */
+        pxd_t inoext[EXTSPERIAG];       /* 1024: inode extent addresses */
+};                              /* (4096) */
+/*
+ *      per AG control information (in inode map control page)
+ *
+ * On-disk form: four little-endian 32-bit fields, 16 bytes per AG.
+ * struct iagctl holds the same four fields as plain ints.
+ */
+struct iagctl_disk {
+        __le32 inofree;         /* 4: free inode list anchor            */
+        __le32 extfree;         /* 4: free extent list anchor           */
+        __le32 numinos;         /* 4: number of backed inodes           */
+        __le32 numfree;         /* 4: number of free inodes             */
+};                              /* (16) */
+struct iagctl {                 /* same fields as struct iagctl_disk, as plain ints */
+        int inofree;            /* free inode list anchor            */
+        int extfree;            /* free extent list anchor           */
+        int numinos;            /* number of backed inodes           */
+        int numfree;            /* number of free inodes             */
+};
+/*
+ *      per fileset/aggregate inode map control page
+ *
+ * On-disk form with little-endian fields.  Going by the per-field byte
+ * counts below: 32 bytes of counters padded out to 2048, followed by one
+ * iagctl_disk entry for each of the MAXAG allocation groups, 4096 bytes
+ * in all.  struct dinomap holds the same values as plain ints.
+ */
+struct dinomap_disk {
+        __le32 in_freeiag;      /* 4: free iag list anchor     */
+        __le32 in_nextiag;      /* 4: next free iag number     */
+        __le32 in_numinos;      /* 4: num of backed inodes */
+        __le32 in_numfree;      /* 4: num of free backed inodes */
+        __le32 in_nbperiext;    /* 4: num of blocks per inode extent */
+        __le32 in_l2nbperiext;  /* 4: l2 of in_nbperiext */
+        __le32 in_diskblock;    /* 4: for standalone test driver  */
+        __le32 in_maxag;        /* 4: for standalone test driver  */
+        u8 pad[2016];           /* 2016: pad to 2048 */
+        struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */
+};                              /* (4096) */
+struct dinomap {                /* same fields as struct dinomap_disk, as plain ints and without the pad */
+        int in_freeiag;         /* free iag list anchor     */
+        int in_nextiag;         /* next free iag number     */
+        int in_numinos;         /* num of backed inodes */
+        int in_numfree;         /* num of free backed inodes */
+        int in_nbperiext;       /* num of blocks per inode extent */
+        int in_l2nbperiext;     /* l2 of in_nbperiext */
+        int in_diskblock;       /* for standalone test driver  */
+        int in_maxag;           /* for standalone test driver  */
+        struct iagctl in_agctl[MAXAG];  /* AG control information, one entry per AG */
+};
+/*
+ *      In-core inode map control page
+ *
+ * Bundles the in-memory control data (im_imap) with the locks and
+ * counters used alongside it.  The im_* shorthand macros that follow this
+ * structure reach into im_imap.
+ */
+struct inomap {
+        struct dinomap im_imap;         /* inode allocation control */
+        struct inode *im_ipimap;        /* ptr to inode for imap   */
+        struct semaphore im_freelock;   /* iag free list lock      */
+        struct semaphore im_aglock[MAXAG];      /* per AG locks, one per possible AG */
+        u32 *im_DBGdimap;       /* debug inode bitmap; set up by DBGdiInit() under _JFS_DEBUG_IMAP */
+        atomic_t im_numinos;    /* num of backed inodes */
+        atomic_t im_numfree;    /* num of free backed inodes */
+};
+#define im_freeiag      im_imap.in_freeiag      /* shorthands for fields of struct inomap's im_imap */
+#define im_nextiag      im_imap.in_nextiag
+#define im_agctl        im_imap.in_agctl
+#define im_nbperiext    im_imap.in_nbperiext
+#define im_l2nbperiext  im_imap.in_l2nbperiext
+/* for standalone testdriver: shorthands for the two test-driver-only
+ * fields of im_imap
+ */
+#define im_diskblock    im_imap.in_diskblock
+#define im_maxag        im_imap.in_maxag
+extern int diFree(struct inode *);
+extern int diAlloc(struct inode *, boolean_t, struct inode *);
+extern int diSync(struct inode *);
+/* external references */
+extern int diUpdatePMap(struct inode *ipimap, unsigned long inum,
+                        boolean_t is_free, struct tblock * tblk);
+extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap);
+extern int diMount(struct inode *);
+extern int diUnmount(struct inode *, int);
+extern int diRead(struct inode *);
+extern struct inode *diReadSpecial(struct super_block *, ino_t, int);
+extern void diWriteSpecial(struct inode *, int);
+extern void diFreeSpecial(struct inode *);
+extern int diWrite(tid_t tid, struct inode *);
+#endif                          /* _H_JFS_IMAP */
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
new file mode 100644
index 000000000000..ebd77c1bed66
--- /dev/null
+++ b/fs/jfs/jfs_incore.h
@@ -0,0 +1,197 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */ 
+#ifndef _H_JFS_INCORE
+#define _H_JFS_INCORE
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include "jfs_types.h"
+#include "jfs_xtree.h"
+#include "jfs_dtree.h"
+/*
+ * JFS magic number: the four ASCII characters 'J' 'F' 'S' '1' (0x4a 0x46
+ * 0x53 0x31), with 'J' in the least significant byte
+ */
+#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */
+/*
+ * JFS-private inode information
+ *
+ * The generic VFS inode is embedded as the last member (vfs_inode);
+ * JFS_IP() converts a struct inode pointer back to this structure.
+ */
+struct jfs_inode_info {
+        int     fileset;        /* fileset number (always 16)*/
+        uint    mode2;          /* jfs-specific mode: the full 32-bit on-disk
+                                 * mode; only its upper 16 bits are written
+                                 * back by copy_to_dinode()
+                                 */
+        pxd_t   ixpxd;          /* inode extent descriptor (kept little-endian) */
+        dxd_t   acl;            /* dxd describing acl (kept little-endian) */
+        dxd_t   ea;             /* dxd describing ea (kept little-endian) */
+        time_t  otime;          /* time created */
+        uint    next_index;     /* next available directory entry index */
+        int     acltype;        /* Type of ACL  */
+        short   btorder;        /* access order */
+        short   btindex;        /* btpage entry index*/
+        struct inode *ipimap;   /* inode map                    */
+        long    cflag;          /* commit flags: bit numbers from enum cflags */
+        u16     bxflag;         /* xflag of pseudo buffer?      */
+        unchar  agno;           /* ag number                    */
+        signed char active_ag;  /* ag currently allocating from */
+        lid_t   blid;           /* lid of pseudo buffer?        */
+        lid_t   atlhead;        /* anonymous tlock list head    */
+        lid_t   atltail;        /* anonymous tlock list tail    */
+        spinlock_t ag_lock;     /* protects active_ag           */
+        struct list_head anon_inode_list; /* inodes having anonymous txns */
+        /*
+         * rdwrlock serializes xtree between reads & writes and synchronizes
+         * changes to special inodes.  Its use would be redundant on
+         * directories since the i_sem taken in the VFS is sufficient.
+         * It is taken through the IREAD_LOCK/IWRITE_LOCK macros.
+         */
+        struct rw_semaphore rdwrlock;
+        /*
+         * commit_sem serializes transaction processing on an inode.
+         * It must be taken after beginning a transaction (txBegin), since
+         * dirty inodes may be committed while a new transaction on the
+         * inode is blocked in txBegin or TxBeginAnon
+         */
+        struct semaphore commit_sem;
+        /* xattr_sem allows us to access the xattrs without taking i_sem */
+        struct rw_semaphore xattr_sem;
+        lid_t   xtlid;          /* lid of xtree lock on directory */
+#ifdef CONFIG_JFS_POSIX_ACL
+        struct posix_acl *i_acl;        /* NOTE(review): presumably the cached access ACL -- confirm */
+        struct posix_acl *i_default_acl;        /* NOTE(review): presumably the cached default ACL -- confirm */
+#endif
+        union {                 /* type-specific tail; leading numbers are byte counts */
+                struct {
+                        xtpage_t _xtroot;       /* 288: xtree root */
+                        struct inomap *_imap;   /* pointer: inode map header  */
+                } file;
+                struct {
+                        struct dir_table_slot _table[12]; /* 96: dir index */
+                        dtroot_t _dtroot;       /* 288: dtree root */
+                } dir;          /* 96 + 288 = 384, the size copy_from_dinode() copies for directories */
+                struct {
+                        unchar _unused[16];     /* 16: */
+                        dxd_t _dxd;             /* 16: */
+                        unchar _inline[128];    /* 128: inline symlink */
+                        /* _inline_ea may overlay the last part of
+                         * file._xtroot if maxentry = XTROOTINITSLOT
+                         */
+                        unchar _inline_ea[128]; /* 128: inline extended attr */
+                } link;
+        } u;
+        u32 dev;        /* raw on-disk device number for char/block nodes;
+                         * will die when we get wide dev_t
+                         */
+        struct inode    vfs_inode;      /* embedded VFS inode */
+};
+#define i_xtroot u.file._xtroot         /* shorthands into the jfs_inode_info union */
+#define i_imap u.file._imap
+#define i_dirtable u.dir._table
+#define i_dtroot u.dir._dtroot
+#define i_inline u.link._inline
+#define i_inline_ea u.link._inline_ea
+#define JFS_ACL_NOT_CACHED ((void *)-1) /* sentinel pointer; NOTE(review): presumably stored in i_acl/i_default_acl until loaded -- confirm */
+#define IREAD_LOCK(ip)          down_read(&JFS_IP(ip)->rdwrlock)        /* take rdwrlock for reading */
+#define IREAD_UNLOCK(ip)        up_read(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip)         down_write(&JFS_IP(ip)->rdwrlock)       /* take rdwrlock for writing */
+#define IWRITE_UNLOCK(ip)       up_write(&JFS_IP(ip)->rdwrlock)
+/*
+ * cflag
+ *
+ * Each enumerator is a bit NUMBER (not a mask) within
+ * jfs_inode_info.cflag.  Manipulate them through the *_cflag() macros
+ * defined right after the enum, which hand the bit number to
+ * set_bit()/clear_bit()/test_bit()/test_and_clear_bit().
+ */
+enum cflags {
+        COMMIT_Nolink,          /* inode committed with zero link count */
+        COMMIT_Inlineea,        /* commit inode inline EA */
+        COMMIT_Freewmap,        /* free WMAP at iClose() */
+        COMMIT_Dirty,           /* Inode is really dirty */
+        COMMIT_Dirtable,        /* commit changes to di_dirtable */
+        COMMIT_Stale,           /* data extent is no longer valid */
+        COMMIT_Synclist,        /* metadata pages on group commit synclist */
+};
+#define set_cflag(flag, ip)     set_bit(flag, &(JFS_IP(ip)->cflag))     /* ip is a struct inode pointer */
+#define clear_cflag(flag, ip)   clear_bit(flag, &(JFS_IP(ip)->cflag))
+#define test_cflag(flag, ip)    test_bit(flag, &(JFS_IP(ip)->cflag))
+#define test_and_clear_cflag(flag, ip) \
+        test_and_clear_bit(flag, &(JFS_IP(ip)->cflag))
+/*
+ * JFS-private superblock information.
+ *
+ * Reached from a VFS super_block through JFS_SBI(), which returns
+ * sb->s_fs_info.
+ */
+struct jfs_sb_info {
+        struct super_block *sb;         /* Point back to vfs super block */
+        unsigned long   mntflag;        /* aggregate attributes (flag bits such as JFS_BAD_SAIT) */
+        struct inode    *ipbmap;        /* block map inode              */
+        struct inode    *ipaimap;       /* aggregate inode map inode    */
+        struct inode    *ipaimap2;      /* secondary aimap inode        */
+        struct inode    *ipimap;        /* aggregate inode map inode
+                                         * NOTE(review): same wording as
+                                         * ipaimap; presumably this is the
+                                         * fileset inode map -- confirm
+                                         */
+        struct jfs_log  *log;           /* log; isReadOnly() reports read-only when this is NULL */
+        struct list_head log_list;      /* volumes associated with a journal */
+        short           bsize;          /* logical block size   */
+        short           l2bsize;        /* log2 logical block size      */
+        short           nbperpage;      /* blocks per page              */
+        short           l2nbperpage;    /* log2 blocks per page */
+        short           l2niperblk;     /* log2 inodes per page
+                                         * NOTE(review): the name says
+                                         * "per blk" -- confirm which
+                                         */
+        dev_t           logdev;         /* external log device  */
+        uint            aggregate;      /* volume identifier in log record */
+        pxd_t           logpxd;         /* pxd describing log   */
+        pxd_t           fsckpxd;        /* pxd describing fsck wkspc */
+        pxd_t           ait2;           /* pxd describing AIT copy      */
+        char            uuid[16];       /* 128-bit uuid for volume      */
+        char            loguuid[16];    /* 128-bit uuid for log */
+        /*
+         * commit_state is used for synchronization of the jfs_commit
+         * threads.  It is protected by LAZY_LOCK().
+         */
+        int             commit_state;   /* commit state (see IN_LAZYCOMMIT) */
+        /* Formerly in ipimap */
+        uint            gengen;         /* inode generation generator; ialloc() assigns gengen++ */
+        uint            inostamp;       /* shows inode belongs to fileset; written to di_inostamp */
+        /* Formerly in ipbmap */
+        struct bmap     *bmap;          /* incore bmap descriptor       */
+        struct nls_table *nls_tab;      /* current codepage             */
+        uint            state;          /* mount/recovery state */
+        unsigned long   flag;           /* mount time flags */
+        uint            p_state;        /* state prior to going no integrity */
+};
+/* jfs_sb_info commit_state value */
+#define IN_LAZYCOMMIT 1
+/*
+ * Map a VFS inode to the jfs_inode_info that embeds it as its vfs_inode
+ * member.  vfs_inode is not a list node, so container_of() is the accurate
+ * spelling of this conversion; list_entry() merely wraps the same macro.
+ */
+static inline struct jfs_inode_info *JFS_IP(struct inode *inode)
+{
+        return container_of(inode, struct jfs_inode_info, vfs_inode);
+}
+/*
+ * Returns 1 while the inode's next_index does not exceed
+ * MAX_INLINE_DIRTABLE_ENTRY + 1, and 0 once it does.
+ */
+static inline int jfs_dirtable_inline(struct inode *inode)
+{
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        if (ji->next_index > (MAX_INLINE_DIRTABLE_ENTRY + 1))
+                return 0;
+        return 1;
+}
+/* Fetch the JFS-private superblock data stored in sb->s_fs_info. */
+static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb)
+{
+        struct jfs_sb_info *sbi = sb->s_fs_info;
+        return sbi;
+}
+/*
+ * Returns 1 when the inode's filesystem has no log attached (log pointer
+ * is NULL), 0 otherwise.
+ */
+static inline int isReadOnly(struct inode *inode)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(inode->i_sb);
+        return sbi->log ? 0 : 1;
+}
+#endif /* _H_JFS_INCORE */
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
new file mode 100644
index 000000000000..84f2459b2191
--- /dev/null
+++ b/fs/jfs/jfs_inode.c
@@ -0,0 +1,104 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_imap.h"
+#include "jfs_dinode.h"
+#include "jfs_debug.h"
+/*
+ * NAME:        ialloc()
+ *
+ * FUNCTION:    Allocate a new inode
+ *
+ * PARAMETERS:
+ *      parent  - inode the new one is created under; supplies the
+ *                superblock and, when it is setgid, the group
+ *      mode    - file type and permission bits for the new inode
+ *
+ * RETURN VALUES:
+ *      the initialized in-memory inode, or NULL if new_inode(), diAlloc()
+ *      or the quota charge fails
+ */
+struct inode *ialloc(struct inode *parent, umode_t mode)
+{
+        struct super_block *sb = parent->i_sb;
+        struct inode *inode;
+        struct jfs_inode_info *jfs_inode;
+        int rc;
+        inode = new_inode(sb);
+        if (!inode) {
+                jfs_warn("ialloc: new_inode returned NULL!");
+                return NULL;
+        }
+        jfs_inode = JFS_IP(inode);
+        /* the second argument tells diAlloc() whether this is a directory */
+        rc = diAlloc(parent, S_ISDIR(mode), inode);
+        if (rc) {
+                jfs_warn("ialloc: diAlloc returned %d!", rc);
+                make_bad_inode(inode);
+                iput(inode);
+                return NULL;
+        }
+        inode->i_uid = current->fsuid;
+        if (parent->i_mode & S_ISGID) {
+                /* setgid parent: inherit its group; directories stay setgid */
+                inode->i_gid = parent->i_gid;
+                if (S_ISDIR(mode))
+                        mode |= S_ISGID;
+        } else
+                inode->i_gid = current->fsgid;
+        /*
+         * Allocate inode to quota.
+         */
+        if (DQUOT_ALLOC_INODE(inode)) {
+                /* quota refused: drop the inode with a zero link count */
+                DQUOT_DROP(inode);
+                inode->i_flags |= S_NOQUOTA;
+                inode->i_nlink = 0;
+                iput(inode);
+                return NULL;
+        }
+        inode->i_mode = mode;
+        /* mode2 carries the JFS-specific flag bits on top of the VFS mode */
+        if (S_ISDIR(mode))
+                jfs_inode->mode2 = IDIRECTORY | mode;
+        else
+                jfs_inode->mode2 = INLINEEA | ISPARSE | mode;
+        inode->i_blksize = sb->s_blocksize;
+        inode->i_blocks = 0;
+        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+        jfs_inode->otime = inode->i_ctime.tv_sec;
+        inode->i_generation = JFS_SBI(sb)->gengen++;
+        jfs_inode->cflag = 0;
+        /* Zero remaining fields */
+        memset(&jfs_inode->acl, 0, sizeof(dxd_t));
+        memset(&jfs_inode->ea, 0, sizeof(dxd_t));
+        jfs_inode->next_index = 0;
+        jfs_inode->acltype = 0;
+        jfs_inode->btorder = 0;
+        jfs_inode->btindex = 0;
+        jfs_inode->bxflag = 0;
+        jfs_inode->blid = 0;
+        jfs_inode->atlhead = 0;
+        jfs_inode->atltail = 0;
+        jfs_inode->xtlid = 0;
+        /* no trailing newline, matching the jfs_warn() messages above */
+        jfs_info("ialloc returns inode = 0x%p", inode);
+        return inode;
+}
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
new file mode 100644
index 000000000000..3df91fbfe781
--- /dev/null
+++ b/fs/jfs/jfs_inode.h
@@ -0,0 +1,23 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2001
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_INODE
+#define _H_JFS_INODE
+extern struct inode *ialloc(struct inode *, umode_t);
+#endif                          /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
new file mode 100644
index 000000000000..10ad1d086685
--- /dev/null
+++ b/fs/jfs/jfs_lock.h
@@ -0,0 +1,51 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2001
+ *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_LOCK
+#define _H_JFS_LOCK
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+/*
+ *      jfs_lock.h
+ */
+/*
+ * Conditional sleep where condition is protected by spinlock
+ *
+ * lock_cmd and unlock_cmd take and release the spinlock
+ *
+ *      wq         - wait queue head to sleep on (its address is taken)
+ *      cond       - expression tested with the lock held, before sleeping
+ *                   and again after every wakeup
+ *      lock_cmd   - statement that takes the spinlock
+ *      unlock_cmd - statement that releases the spinlock
+ *
+ * The lock must be held on entry (cond is tested before the first
+ * unlock_cmd) and is held again when the macro completes.  The sleep is
+ * TASK_UNINTERRUPTIBLE.
+ */
+#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd)    \
+do {                                                    \
+        DECLARE_WAITQUEUE(__wait, current);             \
+                                                        \
+        add_wait_queue(&(wq), &__wait);                 \
+        for (;;) {                                      \
+                set_current_state(TASK_UNINTERRUPTIBLE);\
+                if (cond)                               \
+                        break;                          \
+                unlock_cmd;                             \
+                schedule();                             \
+                lock_cmd;                               \
+        }                                               \
+        __set_current_state(TASK_RUNNING);              \
+        remove_wait_queue(&(wq), &__wait);              \
+} while (0)
+#endif                          /* _H_JFS_LOCK */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
new file mode 100644
index 000000000000..b6a6869ebb4f
--- /dev/null
+++ b/fs/jfs/jfs_logmgr.c
@@ -0,0 +1,2524 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *      jfs_logmgr.c: log manager
+ *
+ * for related information, see transaction manager (jfs_txnmgr.c), and
+ * recovery manager (jfs_logredo.c).
+ *
+ * note: for detail, RTFS.
+ *
+ *      log buffer manager:
+ * special purpose buffer manager supporting log i/o requirements.
+ * per log serial pageout of logpage
+ * queuing i/o requests and redrive i/o at iodone
+ * maintain current logpage buffer
+ * no caching since append only
+ * appropriate jfs buffer cache buffers as needed
+ *
+ *      group commit:
+ * transactions which wrote COMMIT records in the same in-memory
+ * log page during the pageout of previous/current log page(s) are
+ * committed together by the pageout of the page.
+ *
+ *      TBD lazy commit:
+ * transactions are committed asynchronously when the log page
+ * containing its COMMIT is paged out when it becomes full;
+ *
+ *      serialization:
+ * . a per log lock serialize log write.
+ * . a per log lock serialize group commit.
+ * . a per log lock serialize log open/close;
+ *
+ *      TBD log integrity:
+ * careful-write (ping-pong) of last logpage to recover from crash
+ * in overwrite.
+ * detection of split (out-of-order) write of physical sectors
+ * of last logpage via timestamp at end of each sector
+ * (with its mirror data array at trailer).
+ *
+ *      alternatives:
+ * lsn - 64-bit monotonically increasing integer vs
+ * 32-bit lspn and page eor.
+ */
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>          /* for sync_blockdev() */
+#include <linux/bio.h>
+#include <linux/suspend.h>
+#include <linux/delay.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+/*
+ * lbuf's ready to be redriven.  Protected by log_redrive_lock (jfsIO thread)
+ *
+ * jfs_IO_thread_wait is declared without static, so it is visible to
+ * other files.  NOTE(review): presumably this is the queue the jfsIO
+ * thread sleeps on until there is redrive work - confirm against the
+ * thread's main loop.
+ */
+static struct lbuf *log_redrive_list;
+static DEFINE_SPINLOCK(log_redrive_lock);
+DECLARE_WAIT_QUEUE_HEAD(jfs_IO_thread_wait);
+/*
+ *      log read/write serialization (per log)
+ *
+ * loglock is taken with down()/up(), i.e. it is a semaphore, so
+ * LOG_LOCK() may sleep.
+ */
+#define LOG_LOCK_INIT(log)      init_MUTEX(&(log)->loglock)
+#define LOG_LOCK(log)           down(&((log)->loglock))
+#define LOG_UNLOCK(log)         up(&((log)->loglock))
+/*
+ *      log group commit serialization (per log)
+ *
+ * gclock is a spinlock taken with interrupts disabled.  LOGGC_UNLOCK()
+ * re-enables interrupts unconditionally (spin_unlock_irq); lmPostGC(),
+ * which runs at interrupt time, uses spin_lock_irqsave() and
+ * spin_unlock_irqrestore() on gclock directly instead of these macros.
+ *
+ * LOGGC_WAKEUP() wakes every waiter on the tblock's gcwait queue.
+ */
+#define LOGGC_LOCK_INIT(log)    spin_lock_init(&(log)->gclock)
+#define LOGGC_LOCK(log)         spin_lock_irq(&(log)->gclock)
+#define LOGGC_UNLOCK(log)       spin_unlock_irq(&(log)->gclock)
+#define LOGGC_WAKEUP(tblk)      wake_up_all(&(tblk)->gcwait)
+/*
+ *      log sync serialization (per log)
+ *
+ * LOGSYNC_DELTA:   one eighth of the log size, capped at 128 log pages
+ * LOGSYNC_BARRIER: one quarter of the log size
+ */
+#define LOGSYNC_DELTA(logsize)          min((logsize)/8, 128*LOGPSIZE)
+#define LOGSYNC_BARRIER(logsize)        ((logsize)/4)
+/*
+ * alternative (larger) settings, currently disabled:
+#define LOGSYNC_DELTA(logsize)          min((logsize)/4, 256*LOGPSIZE)
+#define LOGSYNC_BARRIER(logsize)        ((logsize)/2)
+*/
+/*
+ *      log buffer cache synchronization
+ *
+ * jfsLCacheLock is a single file-wide spinlock.  The irqsave/irqrestore
+ * variants are used; the saved interrupt state lives in the caller's
+ * flags variable.
+ */
+static DEFINE_SPINLOCK(jfsLCacheLock);
+#define LCACHE_LOCK(flags)      spin_lock_irqsave(&jfsLCacheLock, flags)
+#define LCACHE_UNLOCK(flags)    spin_unlock_irqrestore(&jfsLCacheLock, flags)
+/*
+ * See __SLEEP_COND in jfs_lock.h
+ *
+ * Sleep until cond is true, releasing jfsLCacheLock while asleep.  cond
+ * is tested once up front so the wait-queue setup in __SLEEP_COND is
+ * skipped when no sleep is needed; the "break" leaves this macro's own
+ * do { } while (0).
+ */
+#define LCACHE_SLEEP_COND(wq, cond, flags)      \
+do {                                            \
+        if (cond)                               \
+                break;                          \
+        __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
+} while (0)
+#define LCACHE_WAKEUP(event)    wake_up(event)
+/*
+ *      lbuf buffer cache (lCache) control
+ */
+/* log buffer manager pageout control (cumulative, inclusive)
+ *
+ * These are bit flags: callers OR them together in the flag argument of
+ * lbmWrite() (e.g. lbmWRITE | lbmRELEASE | lbmFREE), and they are tested
+ * in the lbuf's l_flag.
+ */
+#define lbmREAD         0x0001
+#define lbmWRITE        0x0002  /* enqueue at tail of write queue;
+                                 * init pageout if at head of queue;
+                                 */
+#define lbmRELEASE      0x0004  /* remove from write queue
+                                 * at completion of pageout;
+                                 * do not free/recycle it yet:
+                                 * caller will free it;
+                                 */
+#define lbmSYNC         0x0008  /* do not return to freelist
+                                 * when removed from write queue;
+                                 */
+#define lbmFREE         0x0010  /* return to freelist
+                                 * at completion of pageout;
+                                 * the buffer may be recycled;
+                                 */
+#define lbmDONE         0x0020
+#define lbmERROR        0x0040  /* tested by lmPostGC() to mark the
+                                 * page's tblocks with tblkGC_ERROR
+                                 */
+#define lbmGC           0x0080  /* lbmIODone to perform post-GC processing
+                                 * of log page
+                                 */
+#define lbmDIRECT       0x0100
+/*
+ * Global list of active external journals
+ *
+ * NOTE(review): jfs_log_sem presumably guards this list and dummy_log
+ * during log open/close - confirm against the open/close routines.
+ */
+static LIST_HEAD(jfs_external_logs);
+static struct jfs_log *dummy_log = NULL;
+static DECLARE_MUTEX(jfs_log_sem);
+/*
+ * external references
+ *
+ * txLazyUnlock() is called from lmPostGC() to hand a tblock over to the
+ * lazy commit thread; jfs_tlocks_low is tested in lmGroupCommit() and
+ * lmPostGC() when deciding whether to start a group commit write.
+ */
+extern void txLazyUnlock(struct tblock * tblk);
+extern int jfs_stop_threads;
+extern struct completion jfsIOwait;
+extern int jfs_tlocks_low;
+/*
+ * forward references
+ *
+ * lbmIODone is declared through the bio_end_io_t function typedef, so
+ * its signature always matches what the bio layer expects of a
+ * completion callback.
+ */
+static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
+                         struct lrd * lrd, struct tlock * tlck);
+static int lmNextPage(struct jfs_log * log);
+static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
+                           int activate);
+static int open_inline_log(struct super_block *sb);
+static int open_dummy_log(struct super_block *sb);
+static int lbmLogInit(struct jfs_log * log);
+static void lbmLogShutdown(struct jfs_log * log);
+static struct lbuf *lbmAllocate(struct jfs_log * log, int);
+static void lbmFree(struct lbuf * bp);
+static void lbmfree(struct lbuf * bp);
+static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
+static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block);
+static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
+static int lbmIOWait(struct lbuf * bp, int flag);
+static bio_end_io_t lbmIODone;
+static void lbmStartIO(struct lbuf * bp);
+static void lmGCwrite(struct jfs_log * log, int cant_block);
+static int lmLogSync(struct jfs_log * log, int nosyncwait);
+/*
+ *      statistics
+ *
+ * Counters exist only when CONFIG_JFS_STATISTICS is set.  They are
+ * bumped through INCREMENT(), e.g. lmStat.commit in lmWriteRecord() and
+ * lmStat.full_page / lmStat.partial_page in lmGCwrite().
+ */
+#ifdef CONFIG_JFS_STATISTICS
+static struct lmStat {
+        uint commit;            /* # of commit */
+        uint pagedone;          /* # of page written */
+        uint submitted;         /* # of pages submitted */
+        uint full_page;         /* # of full pages submitted */
+        uint partial_page;      /* # of partial pages submitted */
+} lmStat;
+#endif
+/*
+ * NAME:        lmLog()
+ *
+ * FUNCTION:    write a log record;
+ *      besides writing the record, the first log write of a metapage
+ *      puts that page (and, if needed, the transaction) on the log's
+ *      logsynclist, and a sync point is forced once enough log has been
+ *      written.
+ *
+ * PARAMETER:   log     - log to write to
+ *              tblk    - transaction block; NULL when the record is
+ *                        written outside of any transaction
+ *              lrd     - log record descriptor to write
+ *              tlck    - transaction lock describing the data to log;
+ *                        may be NULL
+ *
+ * RETURN:      lsn - offset to the next log record to write (end-of-log);
+ *                    this is the value returned by lmWriteRecord(), or
+ *                    by lmLogSync() when a sync point was triggered
+ *              -1  - error;
+ *                    NOTE(review): no path in this function produces -1
+ *                    itself
+ *
+ * serialization: LOG_LOCK() is taken and released here
+ *
+ * note: todo: log error handler
+ */
+int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+          struct tlock * tlck)
+{
+        int lsn;
+        int diffp, difft;
+        struct metapage *mp = NULL;
+        jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
+                 log, tblk, lrd, tlck);
+        LOG_LOCK(log);
+        /* log by (out-of-transaction) JFS ? */
+        if (tblk == NULL)
+                goto writeRecord;
+        /* log from page ?
+         * (no tlock, a B+-tree root tlock, or a tlock without a
+         * metapage: there is no page lsn to maintain)
+         */
+        if (tlck == NULL ||
+            tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
+                goto writeRecord;
+        /*
+         *      initialize/update page/transaction recovery lsn
+         */
+        lsn = log->lsn;
+        LOGSYNC_LOCK(log);
+        /*
+         * initialize page lsn if first log write of the page
+         */
+        if (mp->lsn == 0) {
+                mp->log = log;
+                mp->lsn = lsn;
+                log->count++;
+                /* insert page at tail of logsynclist */
+                list_add_tail(&mp->synclist, &log->synclist);
+        }
+        /*
+         *      initialize/update lsn of tblock of the page
+         *
+         * transaction inherits oldest lsn of pages associated
+         * with allocation/deallocation of resources (their
+         * log records are used to reconstruct allocation map
+         * at recovery time: inode for inode allocation map,
+         * B+-tree index of extent descriptors for block
+         * allocation map);
+         * allocation map pages inherit transaction lsn at
+         * commit time to allow forwarding log syncpt past log
+         * records associated with allocation/deallocation of
+         * resources only after persistent map of these map pages
+         * have been updated and propagated to home.
+         */
+        /*
+         * initialize transaction lsn:
+         */
+        if (tblk->lsn == 0) {
+                /* inherit lsn of its first page logged */
+                tblk->lsn = mp->lsn;
+                log->count++;
+                /* insert tblock after the page on logsynclist */
+                list_add(&tblk->synclist, &mp->synclist);
+        }
+        /*
+         * update transaction lsn:
+         */
+        else {
+                /* inherit oldest/smallest lsn of page;
+                 * the two lsns are compared through logdiff() rather
+                 * than directly
+                 */
+                logdiff(diffp, mp->lsn, log);
+                logdiff(difft, tblk->lsn, log);
+                if (diffp < difft) {
+                        /* update tblock lsn with page lsn */
+                        tblk->lsn = mp->lsn;
+                        /* move tblock after page on logsynclist */
+                        list_move(&tblk->synclist, &mp->synclist);
+                }
+        }
+        LOGSYNC_UNLOCK(log);
+        /*
+         *      write the log record
+         */
+      writeRecord:
+        lsn = lmWriteRecord(log, tblk, lrd, tlck);
+        /*
+         * forward log syncpt if log reached next syncpt trigger
+         */
+        logdiff(diffp, lsn, log);
+        if (diffp >= log->nextsync)
+                lsn = lmLogSync(log, 0);
+        /* update end-of-log lsn */
+        log->lsn = lsn;
+        LOG_UNLOCK(log);
+        /* return end-of-log address */
+        return lsn;
+}
+/*
+ * NAME:        lmWriteRecord()
+ *
+ * FUNCTION:    move the log record to current log page
+ *      the record data (if any) is copied first, one log vector at a
+ *      time, each followed by its vector descriptor; the log record
+ *      descriptor is copied last.  Whenever the current log page fills
+ *      up, lmNextPage() is called and copying continues on the new page.
+ *      For a COMMIT record the tblock is also queued on the log's
+ *      group commit queue.
+ *
+ * PARAMETER:   log     - log to write to
+ *              tblk    - transaction block; dereferenced only when the
+ *                        record type has LOG_COMMIT set
+ *              lrd     - log record descriptor; lrd->length is filled
+ *                        in here with the number of data bytes moved
+ *              tlck    - tlock describing the data to log, or NULL when
+ *                        the record carries no data
+ *
+ * RETURN:      end-of-log address;
+ *              0 if the tlock is neither a page lock nor an inode lock
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+static int
+lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+              struct tlock * tlck)
+{
+        int lsn = 0;            /* end-of-log address */
+        struct lbuf *bp;        /* dst log page buffer */
+        struct logpage *lp;     /* dst log page */
+        caddr_t dst;            /* destination address in log page */
+        int dstoffset;          /* end-of-log offset in log page */
+        int freespace;          /* free space in log page */
+        caddr_t p;              /* src meta-data page */
+        caddr_t src;
+        int srclen;
+        int nbytes;             /* number of bytes to move */
+        int i;
+        int len;                /* data bytes moved, incl. descriptors */
+        struct linelock *linelock;
+        struct lv *lv;
+        struct lvd *lvd;
+        int l2linesize;
+        len = 0;
+        /* retrieve destination log page to write */
+        bp = (struct lbuf *) log->bp;
+        lp = (struct logpage *) bp->l_ldata;
+        dstoffset = log->eor;
+        /* any log data to write ? */
+        if (tlck == NULL)
+                goto moveLrd;
+        /*
+         *      move log record data
+         */
+        /* retrieve source meta-data page to log */
+        if (tlck->flag & tlckPAGELOCK) {
+                p = (caddr_t) (tlck->mp->data);
+                linelock = (struct linelock *) & tlck->lock;
+        }
+        /* retrieve source in-memory inode to log */
+        else if (tlck->flag & tlckINODELOCK) {
+                if (tlck->type & tlckDTREE)
+                        p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
+                else
+                        p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
+                linelock = (struct linelock *) & tlck->lock;
+        }
+#ifdef  _JFS_WIP
+        else if (tlck->flag & tlckINLINELOCK) {
+                inlinelock = (struct inlinelock *) & tlck;
+                p = (caddr_t) & inlinelock->pxd;
+                linelock = (struct linelock *) & tlck;
+        }
+#endif                          /* _JFS_WIP */
+        else {
+                jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
+                return 0;       /* Probably should trap */
+        }
+        l2linesize = linelock->l2linesize;
+      moveData:
+        ASSERT(linelock->index <= linelock->maxcnt);
+        lv = linelock->lv;
+        for (i = 0; i < linelock->index; i++, lv++) {
+                if (lv->length == 0)
+                        continue;
+                /* is page full ? */
+                if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
+                        /* page become full: move on to next page */
+                        lmNextPage(log);
+                        bp = log->bp;
+                        lp = (struct logpage *) bp->l_ldata;
+                        dstoffset = LOGPHDRSIZE;
+                }
+                /*
+                 * move log vector data
+                 *
+                 * lv->offset and lv->length are counted in lines of
+                 * (1 << l2linesize) bytes; a vector that does not fit
+                 * in the current page is split across pages
+                 */
+                src = (u8 *) p + (lv->offset << l2linesize);
+                srclen = lv->length << l2linesize;
+                len += srclen;
+                while (srclen > 0) {
+                        freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
+                        nbytes = min(freespace, srclen);
+                        dst = (caddr_t) lp + dstoffset;
+                        memcpy(dst, src, nbytes);
+                        dstoffset += nbytes;
+                        /* is page not full ? */
+                        if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
+                                break;
+                        /* page become full: move on to next page */
+                        lmNextPage(log);
+                        bp = (struct lbuf *) log->bp;
+                        lp = (struct logpage *) bp->l_ldata;
+                        dstoffset = LOGPHDRSIZE;
+                        srclen -= nbytes;
+                        src += nbytes;
+                }
+                /*
+                 * move log vector descriptor
+                 *
+                 * 4 bytes: 16-bit offset and 16-bit length, both stored
+                 * little-endian
+                 */
+                len += 4;
+                lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
+                lvd->offset = cpu_to_le16(lv->offset);
+                lvd->length = cpu_to_le16(lv->length);
+                dstoffset += 4;
+                jfs_info("lmWriteRecord: lv offset:%d length:%d",
+                         lv->offset, lv->length);
+        }
+        /* follow the chain to the next linelock, if any (0 ends it) */
+        if ((i = linelock->next)) {
+                linelock = (struct linelock *) lid_to_tlock(i);
+                goto moveData;
+        }
+        /*
+         *      move log record descriptor
+         */
+      moveLrd:
+        lrd->length = cpu_to_le16(len);
+        src = (caddr_t) lrd;
+        srclen = LOGRDSIZE;
+        while (srclen > 0) {
+                freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
+                nbytes = min(freespace, srclen);
+                dst = (caddr_t) lp + dstoffset;
+                memcpy(dst, src, nbytes);
+                dstoffset += nbytes;
+                srclen -= nbytes;
+                /* are there more to move than freespace of page ? */
+                if (srclen)
+                        goto pageFull;
+                /*
+                 * end of log record descriptor
+                 */
+                /* update last log record eor */
+                log->eor = dstoffset;
+                bp->l_eor = dstoffset;
+                lsn = (log->page << L2LOGPSIZE) + dstoffset;
+                if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
+                        tblk->clsn = lsn;
+                        jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
+                                 bp->l_eor);
+                        INCREMENT(lmStat.commit);       /* # of commit */
+                        /*
+                         * enqueue tblock for group commit:
+                         *
+                         * enqueue tblock of non-trivial/synchronous COMMIT
+                         * at tail of group commit queue
+                         * (trivial/asynchronous COMMITs are ignored by
+                         * group commit.)
+                         */
+                        LOGGC_LOCK(log);
+                        /* init tblock gc state */
+                        tblk->flag = tblkGC_QUEUE;
+                        tblk->bp = log->bp;
+                        tblk->pn = log->page;
+                        tblk->eor = log->eor;
+                        /* enqueue transaction to commit queue */
+                        list_add_tail(&tblk->cqueue, &log->cqueue);
+                        LOGGC_UNLOCK(log);
+                }
+                jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
+                        le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);
+                /* page not full ? */
+                if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
+                        return lsn;
+              pageFull:
+                /* page become full: move on to next page;
+                 * reached either by the goto above (descriptor only
+                 * partly copied: the loop continues with the rest) or
+                 * by falling through (descriptor complete, srclen is 0:
+                 * the loop ends and lsn is returned below)
+                 */
+                lmNextPage(log);
+                bp = (struct lbuf *) log->bp;
+                lp = (struct logpage *) bp->l_ldata;
+                dstoffset = LOGPHDRSIZE;
+                src += nbytes;
+        }
+        return lsn;
+}
+/*
+ * NAME:        lmNextPage()
+ *
+ * FUNCTION:    write current page and allocate next page.
+ *      the full page is either handed to group commit (when the tail
+ *      tblock of the commit queue has its COMMIT record on this page)
+ *      or written out directly; then log->page, log->eor and log->bp
+ *      are advanced to a freshly allocated and initialized page.
+ *
+ * PARAMETER:   log
+ *
+ * RETURN:      0
+ *
+ * serialization: LOG_LOCK() held on entry/exit;
+ *                LOGGC_LOCK() is taken and released here
+ */
+static int lmNextPage(struct jfs_log * log)
+{
+        struct logpage *lp;
+        int lspn;               /* log sequence page number */
+        int pn;                 /* current page number */
+        struct lbuf *bp;
+        struct lbuf *nextbp;
+        struct tblock *tblk;
+        /* get current log page number and log sequence page number */
+        pn = log->page;
+        bp = log->bp;
+        lp = (struct logpage *) bp->l_ldata;
+        lspn = le32_to_cpu(lp->h.page);
+        LOGGC_LOCK(log);
+        /*
+         *      write or queue the full page at the tail of write queue
+         */
+        /* get the tail tblk on commit queue */
+        if (list_empty(&log->cqueue))
+                tblk = NULL;
+        else
+                tblk = list_entry(log->cqueue.prev, struct tblock, cqueue);
+        /* every tblk who has COMMIT record on the current page,
+         * and has not been committed, must be on commit queue
+         * since tblk is queued at commit queue at the time
+         * of writing its COMMIT record on the page before
+         * page becomes full (even though the tblk thread
+         * who wrote COMMIT record may have been suspended
+         * currently);
+         */
+        /* is page bound with outstanding tail tblk ? */
+        if (tblk && tblk->pn == pn) {
+                /* mark tblk for end-of-page */
+                tblk->flag |= tblkGC_EOP;
+                if (log->cflag & logGC_PAGEOUT) {
+                        /* if page is not already on write queue,
+                         * just enqueue (no lbmWRITE to prevent redrive)
+                         * buffer to wqueue to ensure correct serial order
+                         * of the pages since log pages will be added
+                         * continuously
+                         */
+                        if (bp->l_wqnext == NULL)
+                                lbmWrite(log, bp, 0, 0);
+                } else {
+                        /*
+                         * No current GC leader, initiate group commit
+                         */
+                        log->cflag |= logGC_PAGEOUT;
+                        lmGCwrite(log, 0);
+                }
+        }
+        /* page is not bound with outstanding tblk:
+         * init write or mark it to be redriven (lbmWRITE)
+         */
+        else {
+                /* finalize the page */
+                bp->l_ceor = bp->l_eor;
+                lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+                lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
+        }
+        LOGGC_UNLOCK(log);
+        /*
+         *      allocate/initialize next page
+         */
+        /* if log wraps, the first data page of log is 2
+         * (0 never used, 1 is superblock).
+         */
+        log->page = (pn == log->size - 1) ? 2 : pn + 1;
+        log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */
+        /* allocate/initialize next log page buffer */
+        nextbp = lbmAllocate(log, log->page);
+        nextbp->l_eor = log->eor;
+        log->bp = nextbp;
+        /* initialize next log page: header and trailer carry the same
+         * sequence page number (previous + 1) and an empty-page eor
+         */
+        lp = (struct logpage *) nextbp->l_ldata;
+        lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
+        lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
+        return 0;
+}
+/*
+ * NAME:        lmGroupCommit()
+ *
+ * FUNCTION:    group commit
+ *      initiate pageout of the pages with COMMIT in the order of
+ *      page number - redrive pageout of the page at the head of
+ *      pageout queue until full page has been written.
+ *
+ * PARAMETER:   log     - log the transaction was written to
+ *              tblk    - transaction block to commit
+ *
+ * RETURN:      0       - committed, or a lazy (COMMIT_LAZY) transaction
+ *                        that does not wait for the pageout
+ *              -EIO    - the commit completed with tblkGC_ERROR set
+ *
+ * NOTE:
+ *      LOGGC_LOCK serializes log group commit queue, and
+ *      transaction blocks on the commit queue.
+ *      N.B. LOG_LOCK is NOT held during lmGroupCommit().
+ */
+int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
+{
+        int rc = 0;
+        LOGGC_LOCK(log);
+        /* group committed already ? */
+        if (tblk->flag & tblkGC_COMMITTED) {
+                if (tblk->flag & tblkGC_ERROR)
+                        rc = -EIO;
+                LOGGC_UNLOCK(log);
+                return rc;
+        }
+        jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);
+        if (tblk->xflag & COMMIT_LAZY)
+                tblk->flag |= tblkGC_LAZY;
+        if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) &&
+            (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag)
+             || jfs_tlocks_low)) {
+                /*
+                 * No pageout in progress
+                 *
+                 * start group commit as its group leader.
+                 * (a lazy transaction only does so when the log is
+                 * being flushed or tlocks are running low)
+                 */
+                log->cflag |= logGC_PAGEOUT;
+                lmGCwrite(log, 0);
+        }
+        if (tblk->xflag & COMMIT_LAZY) {
+                /*
+                 * Lazy transactions can leave now
+                 */
+                LOGGC_UNLOCK(log);
+                return 0;
+        }
+        /* lmGCwrite gives up LOGGC_LOCK, check again */
+        if (tblk->flag & tblkGC_COMMITTED) {
+                if (tblk->flag & tblkGC_ERROR)
+                        rc = -EIO;
+                LOGGC_UNLOCK(log);
+                return rc;
+        }
+        /* upcount transaction waiting for completion
+         * (lmPostGC() drops the count again and wakes us up)
+         */
+        log->gcrtc++;
+        tblk->flag |= tblkGC_READY;
+        __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
+                     LOGGC_LOCK(log), LOGGC_UNLOCK(log));
+        /* removed from commit queue */
+        if (tblk->flag & tblkGC_ERROR)
+                rc = -EIO;
+        LOGGC_UNLOCK(log);
+        return rc;
+}
+/*
+ * NAME:        lmGCwrite()
+ *
+ * FUNCTION:    group commit write
+ *      initiate write of log page, building a group of all transactions
+ *      with commit records on that page.
+ *
+ * PARAMETER:   log        - log whose commit queue is processed
+ *              cant_block - handed unchanged to lbmWrite() as its
+ *                           cant_block argument; lmPostGC(), which runs
+ *                           at interrupt time, passes 1, the other
+ *                           callers pass 0
+ *
+ * RETURN:      None
+ *
+ * NOTE:
+ *      LOGGC_LOCK must be held by caller.
+ *      The commit queue must not be empty: the head entry is read
+ *      without a check (every caller tests the queue first).
+ *      N.B. LOG_LOCK is NOT held during lmGroupCommit().
+ */
+static void lmGCwrite(struct jfs_log * log, int cant_block)
+{
+        struct lbuf *bp;
+        struct logpage *lp;
+        int gcpn;               /* group commit page number */
+        struct tblock *tblk;
+        struct tblock *xtblk = NULL;
+        /*
+         * build the commit group of a log page
+         *
+         * scan commit queue and make a commit group of all
+         * transactions with COMMIT records on the same log page.
+         */
+        /* get the head tblk on the commit queue */
+        gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn;
+        list_for_each_entry(tblk, &log->cqueue, cqueue) {
+                if (tblk->pn != gcpn)
+                        break;
+                xtblk = tblk;
+                /* state transition: (QUEUE, READY) -> COMMIT */
+                tblk->flag |= tblkGC_COMMIT;
+        }
+        /* last tblk of the page; never NULL for a non-empty queue,
+         * because the head entry always matches its own page number
+         */
+        tblk = xtblk;
+        /*
+         * pageout to commit transactions on the log page.
+         */
+        bp = (struct lbuf *) tblk->bp;
+        lp = (struct logpage *) bp->l_ldata;
+        /* is page already full ? */
+        if (tblk->flag & tblkGC_EOP) {
+                /* mark page to free at end of group commit of the page */
+                tblk->flag &= ~tblkGC_EOP;
+                tblk->flag |= tblkGC_FREE;
+                bp->l_ceor = bp->l_eor;
+                lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+                lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
+                         cant_block);
+                INCREMENT(lmStat.full_page);
+        }
+        /* page is not yet full: commit only up to the end of the last
+         * COMMIT record of the group
+         */
+        else {
+                bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */
+                lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
+                lbmWrite(log, bp, lbmWRITE | lbmGC, cant_block);
+                INCREMENT(lmStat.partial_page);
+        }
+}
+/*
+ * NAME:        lmPostGC()
+ *
+ * FUNCTION:    group commit post-processing
+ *      Processes transactions after their commit records have been written
+ *      to disk, redriving log I/O if necessary.
+ *
+ * PARAMETER:   bp      - log page buffer whose pageout has completed
+ *
+ * RETURN:      None
+ *
+ * NOTE:
+ *      This routine is called at interrupt time by lbmIODone.
+ *      gclock is therefore taken with spin_lock_irqsave() instead of
+ *      LOGGC_LOCK(), and lbmWrite()/lmGCwrite() are called with
+ *      cant_block set to 1.
+ */
+static void lmPostGC(struct lbuf * bp)
+{
+        unsigned long flags;
+        struct jfs_log *log = bp->l_log;
+        struct logpage *lp;
+        struct tblock *tblk, *temp;
+        //LOGGC_LOCK(log);
+        spin_lock_irqsave(&log->gclock, flags);
+        /*
+         * current pageout of group commit completed.
+         *
+         * remove/wakeup transactions from commit queue who were
+         * group committed with the current log page
+         * (the _safe iterator is needed because entries are deleted
+         * while walking the queue)
+         */
+        list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) {
+                if (!(tblk->flag & tblkGC_COMMIT))
+                        break;
+                /* if transaction was marked GC_COMMIT then
+                 * it has been shipped in the current pageout
+                 * and made it to disk - it is committed.
+                 */
+                if (bp->l_flag & lbmERROR)
+                        tblk->flag |= tblkGC_ERROR;
+                /* remove it from the commit queue */
+                list_del(&tblk->cqueue);
+                tblk->flag &= ~tblkGC_QUEUE;
+                if (tblk == log->flush_tblk) {
+                        /* we can stop flushing the log now */
+                        clear_bit(log_FLUSH, &log->flag);
+                        log->flush_tblk = NULL;
+                }
+                jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
+                         tblk->flag);
+                if (!(tblk->xflag & COMMIT_FORCE))
+                        /*
+                         * Hand tblk over to lazy commit thread
+                         */
+                        txLazyUnlock(tblk);
+                else {
+                        /* state transition: COMMIT -> COMMITTED */
+                        tblk->flag |= tblkGC_COMMITTED;
+                        if (tblk->flag & tblkGC_READY)
+                                log->gcrtc--;
+                        LOGGC_WAKEUP(tblk);
+                }
+                /* was page full before pageout ?
+                 * (and this is the last tblk bound with the page)
+                 */
+                if (tblk->flag & tblkGC_FREE)
+                        lbmFree(bp);
+                /* did page become full after pageout ?
+                 * (and this is the last tblk bound with the page)
+                 */
+                else if (tblk->flag & tblkGC_EOP) {
+                        /* finalize the page */
+                        lp = (struct logpage *) bp->l_ldata;
+                        bp->l_ceor = bp->l_eor;
+                        lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+                        jfs_info("lmPostGC: calling lbmWrite");
+                        lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
+                                 1);
+                }
+        }
+        /* are there any transactions who have entered lmGroupCommit()
+         * (whose COMMITs are after that of the last log page written)?
+         * They are waiting for new group commit (in the __SLEEP_COND of
+         * lmGroupCommit()), or lazy transactions are on a full (queued)
+         * log page;
+         * select the latest ready transaction as new group leader and
+         * wake her up to lead her group.
+         *
+         * tblk is only dereferenced when the queue is non-empty; in
+         * that case the loop above stopped via break and tblk is the
+         * first entry still on the queue.
+         */
+        if ((!list_empty(&log->cqueue)) &&
+            ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
+             test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low))
+                /*
+                 * Call lmGCwrite with new group leader
+                 */
+                lmGCwrite(log, 1);
+        /* no transaction are ready yet (transactions are only just
+         * queued (GC_QUEUE) and not entered for group commit yet).
+         * the first transaction entering group commit
+         * will elect herself as new group leader.
+         */
+        else
+                log->cflag &= ~logGC_PAGEOUT;
+        //LOGGC_UNLOCK(log);
+        spin_unlock_irqrestore(&log->gclock, flags);
+        return;
+}
+/*
+ * NAME:        lmLogSync()
+ *
+ * FUNCTION:    write log SYNCPT record for specified log
+ *      if new sync address is available
+ *      (normally the case if sync() is executed by back-ground
+ *      process).
+ *      if not, the sync address is first moved forward from the
+ *      log's synclist (pushing dirty metapages is still a ToDo).
+ *      calculate new value of log->nextsync which determines when
+ *      this code is called again.
+ *
+ *      this is called only from lmLog().
+ *
+ * PARAMETER:   log        - log structure
+ *              nosyncwait - non-zero: return as soon as the next syncpt
+ *                           trigger has been computed, without raising
+ *                           the sync barrier
+ *
+ * RETURN:      lsn returned by lmWriteRecord() for the SYNCPT record,
+ *              or log->lsn when no SYNCPT record was written
+ *
+ * serialization: LOG_LOCK() held on entry/exit
+ */
+static int lmLogSync(struct jfs_log * log, int nosyncwait)
+{
+        int logsize;
+        int written;            /* written since last syncpt */
+        int free;               /* free space left available */
+        int delta;              /* additional delta to write normally */
+        int more;               /* additional write granted */
+        struct lrd lrd;
+        int lsn;
+        struct logsyncblk *lp;
+        /*
+         *      forward syncpt
+         */
+        /* if last sync is same as last syncpt,
+         * invoke sync point forward processing to update sync:
+         * sync becomes the lsn of the first entry on log->synclist,
+         * or log->lsn when that list is empty.
+         */
+        if (log->sync == log->syncpt) {
+                LOGSYNC_LOCK(log);
+                /* ToDo: push dirty metapages out to disk */
+//              bmLogSync(log);
+                if (list_empty(&log->synclist))
+                        log->sync = log->lsn;
+                else {
+                        lp = list_entry(log->synclist.next,
+                                        struct logsyncblk, synclist);
+                        log->sync = lp->lsn;
+                }
+                LOGSYNC_UNLOCK(log);
+        }
+        /* if sync is different from last syncpt,
+         * write a SYNCPT record with syncpt = sync.
+         * reset syncpt = sync
+         */
+        if (log->sync != log->syncpt) {
+                struct jfs_sb_info *sbi;
+                /*
+                 * We need to make sure all of the "written" metapages
+                 * actually make it to disk
+                 *
+                 * Two passes over every file system on log->sb_list
+                 * (JFS_NOINTEGRITY ones are skipped): the first starts
+                 * writeback on the ipbmap, ipimap and block device
+                 * mappings, the second waits for that writeback.
+                 */
+                list_for_each_entry(sbi, &log->sb_list, log_list) {
+                        if (sbi->flag & JFS_NOINTEGRITY)
+                                continue;
+                        filemap_fdatawrite(sbi->ipbmap->i_mapping);
+                        filemap_fdatawrite(sbi->ipimap->i_mapping);
+                        filemap_fdatawrite(sbi->sb->s_bdev->bd_inode->i_mapping);
+                }
+                list_for_each_entry(sbi, &log->sb_list, log_list) {
+                        if (sbi->flag & JFS_NOINTEGRITY)
+                                continue;
+                        filemap_fdatawait(sbi->ipbmap->i_mapping);
+                        filemap_fdatawait(sbi->ipimap->i_mapping);
+                        filemap_fdatawait(sbi->sb->s_bdev->bd_inode->i_mapping);
+                }
+                lrd.logtid = 0;
+                lrd.backchain = 0;
+                lrd.type = cpu_to_le16(LOG_SYNCPT);
+                lrd.length = 0;
+                lrd.log.syncpt.sync = cpu_to_le32(log->sync);
+                lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+                log->syncpt = log->sync;
+        } else
+                lsn = log->lsn;
+        /*
+         *      setup next syncpt trigger (SWAG)
+         */
+        logsize = log->logsize;
+        logdiff(written, lsn, log);     /* sets 'written'; macro defined elsewhere - presumably bytes logged since syncpt, confirm */
+        free = logsize - written;
+        delta = LOGSYNC_DELTA(logsize);
+        more = min(free / 2, delta);
+        if (more < 2 * LOGPSIZE) {
+                jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
+                /*
+                 *      log wrapping
+                 *
+                 * option 1 - panic ? No.!
+                 * option 2 - shutdown file systems
+                 *            associated with log ?
+                 * option 3 - extend log ?
+                 */
+                /*
+                 * option 4 - second chance
+                 *
+                 * mark log wrapped, and continue.
+                 * when all active transactions are completed,
+                 * mark log valid for recovery.
+                 * if crashed during invalid state, log state
+                 * implies invalid log, forcing fsck().
+                 */
+                /* mark log state log wrap in log superblock */
+                /* log->state = LOGWRAP; */
+                /* reset sync point computation */
+                log->syncpt = log->sync = lsn;
+                log->nextsync = delta;
+        } else
+                /* next syncpt trigger = written + more */
+                log->nextsync = written + more;
+        /* return if lmLogSync() from outside of transaction, e.g., sync() */
+        if (nosyncwait)
+                return lsn;
+        /* if number of bytes written from last sync point is more
+         * than LOGSYNC_BARRIER(logsize) (1/4 of the log size per the
+         * original note), stop new transactions from
+         * starting until all current transactions are completed
+         * by setting syncbarrier flag.
+         * logs of 32 log pages or fewer never raise the barrier.
+         */
+        if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) {
+                set_bit(log_SYNCBARRIER, &log->flag);
+                jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
+                         log->syncpt);
+                /*
+                 * We may have to initiate group commit
+                 */
+                jfs_flush_journal(log, 0);
+        }
+        return lsn;
+}
+/*
+ * NAME:        lmLogOpen()
+ *
+ * FUNCTION:    open the log on first open;
+ *      insert filesystem in the active list of the log.
+ *
+ *      JFS_NOINTEGRITY mounts are handed to open_dummy_log() and
+ *      JFS_INLINELOG mounts to open_inline_log(); everything else uses
+ *      an external log device, shared between file systems through the
+ *      jfs_external_logs list.
+ *
+ * PARAMETER:   sb      - super block of the file system being mounted;
+ *                        on success JFS_SBI(sb)->log points to the log
+ *
+ * RETURN:      0       - success
+ *              -EINVAL - an already open log on the same device carries
+ *                        a different uuid
+ *              -ENOMEM - log descriptor could not be allocated
+ *              negative errno from open_by_devnum(), bd_claim(),
+ *              lmLogInit() or lmLogFileSystem()
+ *
+ * serialization: jfs_log_sem is held while the external log list is
+ *      searched and updated
+ */
+int lmLogOpen(struct super_block *sb)
+{
+        int rc;
+        struct block_device *bdev;
+        struct jfs_log *log;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        if (sbi->flag & JFS_NOINTEGRITY)
+                return open_dummy_log(sb);
+        
+        if (sbi->mntflag & JFS_INLINELOG)
+                return open_inline_log(sb);
+        down(&jfs_log_sem);
+        /* reuse the descriptor if this log device is already open */
+        list_for_each_entry(log, &jfs_external_logs, journal_list) {
+                if (log->bdev->bd_dev == sbi->logdev) {
+                        if (memcmp(log->uuid, sbi->loguuid,
+                                   sizeof(log->uuid))) {
+                                jfs_warn("wrong uuid on JFS journal\n");
+                                up(&jfs_log_sem);
+                                return -EINVAL;
+                        }
+                        /*
+                         * add file system to log active file system list
+                         */
+                        if ((rc = lmLogFileSystem(log, sbi, 1))) {
+                                up(&jfs_log_sem);
+                                return rc;
+                        }
+                        goto journal_found;
+                }
+        }
+        if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL))) {
+                up(&jfs_log_sem);
+                return -ENOMEM;
+        }
+        memset(log, 0, sizeof(struct jfs_log));
+        INIT_LIST_HEAD(&log->sb_list);
+        init_waitqueue_head(&log->syncwait);
+        /*
+         *      external log as separate logical volume
+         *
+         * file systems to log may have n-to-1 relationship;
+         */
+        bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
+        if (IS_ERR(bdev)) {
+                /*
+                 * PTR_ERR() already yields the negative errno; negating
+                 * it would hand a positive value back to the caller.
+                 */
+                rc = PTR_ERR(bdev);
+                goto free;
+        }
+        if ((rc = bd_claim(bdev, log))) {
+                goto close;
+        }
+        log->bdev = bdev;
+        memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
+        
+        /*
+         * initialize log:
+         */
+        if ((rc = lmLogInit(log)))
+                goto unclaim;
+        list_add(&log->journal_list, &jfs_external_logs);
+        /*
+         * add file system to log active file system list
+         */
+        if ((rc = lmLogFileSystem(log, sbi, 1)))
+                goto shutdown;
+journal_found:
+        /* link the file system and the log to each other */
+        LOG_LOCK(log);
+        list_add(&sbi->log_list, &log->sb_list);
+        sbi->log = log;
+        LOG_UNLOCK(log);
+        up(&jfs_log_sem);
+        return 0;
+        /*
+         *      unwind on error
+         */
+      shutdown:         /* unwind lbmLogInit() */
+        list_del(&log->journal_list);
+        lbmLogShutdown(log);
+      unclaim:
+        bd_release(bdev);
+      close:            /* close external log device */
+        blkdev_put(bdev);
+      free:             /* free log descriptor */
+        up(&jfs_log_sem);
+        kfree(log);
+        jfs_warn("lmLogOpen: exit(%d)", rc);
+        return rc;
+}
+/*
+ *      open_inline_log()
+ *
+ * allocate and initialize the descriptor of a log that lives on the
+ * block device of the file system itself (sb->s_bdev), at the extent
+ * described by the logpxd of the file system, and attach the file
+ * system to it.
+ *
+ * RETURN:      0       - success
+ *              -ENOMEM - log descriptor could not be allocated
+ *              error returned by lmLogInit()
+ */
+static int open_inline_log(struct super_block *sb)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_log *log;
+        int rc;
+        log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL);
+        if (!log)
+                return -ENOMEM;
+        memset(log, 0, sizeof(struct jfs_log));
+        INIT_LIST_HEAD(&log->sb_list);
+        init_waitqueue_head(&log->syncwait);
+        set_bit(log_INLINELOG, &log->flag);
+        /* geometry: log size is kept in log pages, not fs blocks */
+        log->bdev = sb->s_bdev;
+        log->base = addressPXD(&sbi->logpxd);
+        log->size = lengthPXD(&sbi->logpxd) >>
+            (L2LOGPSIZE - sb->s_blocksize_bits);
+        log->l2bsize = sb->s_blocksize_bits;
+        ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);
+        /* bring the log up; on failure nothing else references it yet */
+        rc = lmLogInit(log);
+        if (rc) {
+                kfree(log);
+                jfs_warn("lmLogOpen: exit(%d)", rc);
+                return rc;
+        }
+        list_add(&sbi->log_list, &log->sb_list);
+        sbi->log = log;
+        return rc;
+}
+/*
+ *      open_dummy_log()
+ *
+ * attach the file system to the single, shared dummy_log descriptor,
+ * creating and initializing that descriptor (no_integrity set, made-up
+ * base and size) on first use.
+ *
+ * RETURN:      0       - success
+ *              -ENOMEM - dummy log descriptor could not be allocated
+ *              error returned by lmLogInit()
+ *
+ * serialization: takes jfs_log_sem for the whole operation
+ */
+static int open_dummy_log(struct super_block *sb)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        int rc = 0;
+        down(&jfs_log_sem);
+        if (dummy_log)
+                goto attach;
+        dummy_log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL);
+        if (!dummy_log) {
+                rc = -ENOMEM;
+                goto unlock;
+        }
+        memset(dummy_log, 0, sizeof(struct jfs_log));
+        INIT_LIST_HEAD(&dummy_log->sb_list);
+        init_waitqueue_head(&dummy_log->syncwait);
+        dummy_log->no_integrity = 1;
+        /* Make up some stuff */
+        dummy_log->base = 0;
+        dummy_log->size = 1024;
+        rc = lmLogInit(dummy_log);
+        if (rc) {
+                /* leave no half-built descriptor behind for the next mount */
+                kfree(dummy_log);
+                dummy_log = NULL;
+                goto unlock;
+        }
+      attach:
+        LOG_LOCK(dummy_log);
+        list_add(&sbi->log_list, &dummy_log->sb_list);
+        sbi->log = dummy_log;
+        LOG_UNLOCK(dummy_log);
+      unlock:
+        up(&jfs_log_sem);
+        return rc;
+}
+/*
+ * NAME:        lmLogInit()
+ *
+ * FUNCTION:    log initialization at first log open.
+ *
+ *      logredo() (or logformat()) should have been run previously.
+ *      initialize the log from log superblock.
+ *      set the log state in the superblock to LOGMOUNT and
+ *      write SYNCPT log record.
+ *
+ *      for a no_integrity log no superblock is read or written; only
+ *      a current page buffer is set up.
+ *
+ * PARAMETER:   log     - log structure
+ *
+ * RETURN:      0       - if ok
+ *              -EINVAL - bad log magic number, log not in LOGREDONE
+ *                        state, inline log size mismatch or external
+ *                        log uuid mismatch
+ *              error returned from lbmLogInit(), lbmRead() or
+ *              lbmIOWait()
+ *
+ * serialization: single first open thread
+ */
+int lmLogInit(struct jfs_log * log)
+{
+        int rc = 0;
+        struct lrd lrd;
+        struct logsuper *logsuper;
+        struct lbuf *bpsuper;
+        struct lbuf *bp;
+        struct logpage *lp;
+        int lsn = 0;
+        jfs_info("lmLogInit: log:0x%p", log);
+        /* initialize the group commit serialization lock */
+        LOGGC_LOCK_INIT(log);
+        /* allocate/initialize the log write serialization lock */
+        LOG_LOCK_INIT(log);
+        LOGSYNC_LOCK_INIT(log);
+        INIT_LIST_HEAD(&log->synclist);
+        INIT_LIST_HEAD(&log->cqueue);
+        log->flush_tblk = NULL;
+        log->count = 0;
+        /*
+         * initialize log i/o
+         */
+        if ((rc = lbmLogInit(log)))
+                return rc;
+        /* an inline log had l2bsize set by its opener */
+        if (!test_bit(log_INLINELOG, &log->flag))
+                log->l2bsize = L2LOGPSIZE;
+        
+        /* check for disabled journaling to disk */
+        if (log->no_integrity) {
+                /*
+                 * Journal pages will still be filled.  When the time comes
+                 * to actually do the I/O, the write is not done, and the
+                 * endio routine is called directly.
+                 */
+                bp = lbmAllocate(log , 0);
+                log->bp = bp;
+                bp->l_pn = bp->l_eor = 0;
+        } else {
+                /*
+                 * validate log superblock
+                 */
+                if ((rc = lbmRead(log, 1, &bpsuper)))
+                        goto errout10;
+                logsuper = (struct logsuper *) bpsuper->l_ldata;
+                if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
+                        jfs_warn("*** Log Format Error ! ***");
+                        rc = -EINVAL;
+                        goto errout20;
+                }
+                /* logredo() should have been run successfully. */
+                if (logsuper->state != cpu_to_le32(LOGREDONE)) {
+                        jfs_warn("*** Log Is Dirty ! ***");
+                        rc = -EINVAL;
+                        goto errout20;
+                }
+                /* initialize log from log superblock */
+                if (test_bit(log_INLINELOG,&log->flag)) {
+                        if (log->size != le32_to_cpu(logsuper->size)) {
+                                rc = -EINVAL;
+                                goto errout20;
+                        }
+                        jfs_info("lmLogInit: inline log:0x%p base:0x%Lx "
+                                 "size:0x%x", log,
+                                 (unsigned long long) log->base, log->size);
+                } else {
+                        if (memcmp(logsuper->uuid, log->uuid, 16)) {
+                                jfs_warn("wrong uuid on JFS log device");
+                                /*
+                                 * rc is still 0 from the successful
+                                 * lbmRead(); without an error code the
+                                 * caller would treat the torn-down log
+                                 * as usable.
+                                 */
+                                rc = -EINVAL;
+                                goto errout20;
+                        }
+                        log->size = le32_to_cpu(logsuper->size);
+                        log->l2bsize = le32_to_cpu(logsuper->l2bsize);
+                        jfs_info("lmLogInit: external log:0x%p base:0x%Lx "
+                                 "size:0x%x", log,
+                                 (unsigned long long) log->base, log->size);
+                }
+                /* split the end-of-log address into page number and offset */
+                log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
+                log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);
+                /*
+                 * initialize for log append write mode
+                 */
+                /* establish current/end-of-log page/buffer */
+                if ((rc = lbmRead(log, log->page, &bp)))
+                        goto errout20;
+                lp = (struct logpage *) bp->l_ldata;
+                jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
+                         le32_to_cpu(logsuper->end), log->page, log->eor,
+                         le16_to_cpu(lp->h.eor));
+                log->bp = bp;
+                bp->l_pn = log->page;
+                bp->l_eor = log->eor;
+                /* if current page is full, move on to next page */
+                if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
+                        lmNextPage(log);
+                /*
+                 * initialize log syncpoint
+                 */
+                /*
+                 * write the first SYNCPT record with syncpoint = 0
+                 * (i.e., log redo up to HERE !);
+                 * remove current page from lbm write queue at end of pageout
+                 * (to write log superblock update), but do not release to
+                 * freelist;
+                 */
+                lrd.logtid = 0;
+                lrd.backchain = 0;
+                lrd.type = cpu_to_le16(LOG_SYNCPT);
+                lrd.length = 0;
+                lrd.log.syncpt.sync = 0;
+                lsn = lmWriteRecord(log, NULL, &lrd, NULL);
+                /* re-read log->bp: it is the page the record ended on */
+                bp = log->bp;
+                bp->l_ceor = bp->l_eor;
+                lp = (struct logpage *) bp->l_ldata;
+                lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
+                lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
+                if ((rc = lbmIOWait(bp, 0)))
+                        goto errout30;
+                /*
+                 * update/write superblock
+                 *
+                 * NOTE(review): if this lbmIOWait(bpsuper, lbmFREE)
+                 * fails, errout20 calls lbmFree(bpsuper) as well -
+                 * confirm lbmIOWait() does not already release the
+                 * buffer on error.
+                 */
+                logsuper->state = cpu_to_le32(LOGMOUNT);
+                log->serial = le32_to_cpu(logsuper->serial) + 1;
+                logsuper->serial = cpu_to_le32(log->serial);
+                lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+                if ((rc = lbmIOWait(bpsuper, lbmFREE)))
+                        goto errout30;
+        }
+        /* initialize logsync parameters */
+        log->logsize = (log->size - 2) << L2LOGPSIZE;
+        log->lsn = lsn;
+        log->syncpt = lsn;
+        log->sync = log->syncpt;
+        log->nextsync = LOGSYNC_DELTA(log->logsize);
+        jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
+                 log->lsn, log->syncpt, log->sync);
+        /*
+         * initialize for lazy/group commit
+         */
+        log->clsn = lsn;
+        return 0;
+        /*
+         *      unwind on error
+         */
+      errout30:         /* release log page */
+        log->wqueue = NULL;
+        bp->l_wqnext = NULL;
+        lbmFree(bp);
+      errout20:         /* release log superblock */
+        lbmFree(bpsuper);
+      errout10:         /* unwind lbmLogInit() */
+        lbmLogShutdown(log);
+        jfs_warn("lmLogInit: exit(%d)", rc);
+        return rc;
+}
+/*
+ * NAME:        lmLogClose()
+ *
+ * FUNCTION:    remove file system <ipmnt> from active list of log <iplog>
+ *              and close it on last close.
+ *
+ *              an inline log is always shut down and freed here; an
+ *              external log only when no file system remains on its
+ *              sb_list; the no_integrity (dummy) log is never freed.
+ *
+ * PARAMETER:   sb      - superblock
+ *
+ * RETURN:      0, or the value returned by lmLogShutdown() when the
+ *              log was shut down
+ *
+ * serialization: jfs_log_sem is held for the whole operation
+ */
+int lmLogClose(struct super_block *sb)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_log *log = sbi->log;
+        struct block_device *bdev;
+        int rc = 0;
+        jfs_info("lmLogClose: log:0x%p", log);
+        down(&jfs_log_sem);
+        LOG_LOCK(log);
+        list_del(&sbi->log_list);
+        LOG_UNLOCK(log);
+        sbi->log = NULL;
+        /*
+         * We need to make sure all of the "written" metapages
+         * actually make it to disk
+         */
+        sync_blockdev(sb->s_bdev);
+        if (test_bit(log_INLINELOG, &log->flag)) {
+                /*
+                 *      in-line log in host file system
+                 */
+                rc = lmLogShutdown(log);
+                kfree(log);
+                goto out;
+        }
+        if (!log->no_integrity)
+                lmLogFileSystem(log, sbi, 0);   /* result is not checked */
+        if (!list_empty(&log->sb_list))         /* log still in use by others */
+                goto out;
+        /*
+         * TODO: ensure that the dummy_log is in a state to allow
+         * lbmLogShutdown to deallocate all the buffers and call
+         * kfree against dummy_log.  For now, leave dummy_log & its
+         * buffers in memory, and reuse if another no-integrity mount
+         * is requested.
+         */
+        if (log->no_integrity)
+                goto out;
+        /*
+         *      external log as separate logical volume
+         */
+        list_del(&log->journal_list);
+        bdev = log->bdev;               /* saved: needed after the shutdown */
+        rc = lmLogShutdown(log);
+        bd_release(bdev);
+        blkdev_put(bdev);
+        kfree(log);
+      out:
+        up(&jfs_log_sem);
+        jfs_info("lmLogClose: exit(%d)", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_flush_journal()
+ *
+ * FUNCTION:    initiate write of any outstanding transactions to the journal
+ *              and optionally wait until they are all written to disk
+ *
+ *              wait == 0  flush until latest txn is committed, don't wait
+ *              wait == 1  flush until latest txn is committed, wait
+ *              wait > 1   flush until all txn's are complete, wait
+ *
+ * PARAMETER:   log     - log to flush; NULL is accepted and ignored
+ *              wait    - see above
+ *
+ * serialization: takes and releases LOGGC_LOCK() internally
+ */
+void jfs_flush_journal(struct jfs_log *log, int wait)
+{
+        int i;
+        struct tblock *target = NULL;   /* last tblock on the commit queue, if any */
+        /* jfs_write_inode may call us during read-only mount */
+        if (!log)
+                return;
+        jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);
+        LOGGC_LOCK(log);
+        if (!list_empty(&log->cqueue)) {
+                /*
+                 * This ensures that we will keep writing to the journal as long
+                 * as there are unwritten commit records
+                 */
+                target = list_entry(log->cqueue.prev, struct tblock, cqueue);
+                if (test_bit(log_FLUSH, &log->flag)) {
+                        /*
+                         * We're already flushing.
+                         * if flush_tblk is NULL, we are flushing everything,
+                         * so leave it that way.  Otherwise, update it to the
+                         * latest transaction
+                         */
+                        if (log->flush_tblk)
+                                log->flush_tblk = target;
+                } else {
+                        /* Only flush until latest transaction is committed */
+                        log->flush_tblk = target;
+                        set_bit(log_FLUSH, &log->flag);
+                        /*
+                         * Initiate I/O on outstanding transactions
+                         * unless a group commit pageout is already
+                         * marked in progress (logGC_PAGEOUT)
+                         */
+                        if (!(log->cflag & logGC_PAGEOUT)) {
+                                log->cflag |= logGC_PAGEOUT;
+                                lmGCwrite(log, 0);
+                        }
+                }
+        }
+        if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
+                /* Flush until all activity complete */
+                set_bit(log_FLUSH, &log->flag);
+                log->flush_tblk = NULL;
+        }
+        if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
+                DECLARE_WAITQUEUE(__wait, current);
+                add_wait_queue(&target->gcwait, &__wait);
+                set_current_state(TASK_UNINTERRUPTIBLE);
+                LOGGC_UNLOCK(log);      /* drop the lock only after queueing ourselves */
+                schedule();
+                current->state = TASK_RUNNING;
+                LOGGC_LOCK(log);
+                remove_wait_queue(&target->gcwait, &__wait);
+        }
+        LOGGC_UNLOCK(log);
+        if (wait < 2)
+                return;
+        /*
+         * If there was recent activity, we may need to wait
+         * for the lazycommit thread to catch up
+         * (polls every 250 ms, at most 800 times)
+         */
+        if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
+                for (i = 0; i < 800; i++) {     /* Too much? */
+                        msleep(250);
+                        if (list_empty(&log->cqueue) &&
+                            list_empty(&log->synclist))
+                                break;
+                }
+        }
+        assert(list_empty(&log->cqueue));
+        assert(list_empty(&log->synclist));
+        clear_bit(log_FLUSH, &log->flag);
+}
+/*
+ * NAME:        lmLogShutdown()
+ *
+ * FUNCTION:    log shutdown at last LogClose().
+ *
+ *              flush the journal, write a final SYNCPT record,
+ *              then rewrite the log superblock with state LOGREDONE
+ *              and the lsn of that record as end of log, and release
+ *              the log buffers.
+ *
+ * PARAMETER:   log     - log structure
+ *
+ * RETURN:      0       - success
+ *              error returned by lbmRead() or by lbmIOWait() on the
+ *              log superblock
+ *
+ * serialization: single last close thread
+ */
+int lmLogShutdown(struct jfs_log * log)
+{
+        struct lrd syncpt;
+        struct logsuper *logsuper;
+        struct logpage *lp;
+        struct lbuf *bpsuper;
+        struct lbuf *bp;
+        int lsn;
+        int rc;
+        jfs_info("lmLogShutdown: log:0x%p", log);
+        jfs_flush_journal(log, 2);
+        /*
+         * last SYNCPT record, syncpoint = 0
+         * (i.e., log redo up to HERE !)
+         */
+        syncpt.type = cpu_to_le16(LOG_SYNCPT);
+        syncpt.logtid = 0;
+        syncpt.backchain = 0;
+        syncpt.length = 0;
+        syncpt.log.syncpt.sync = 0;
+        lsn = lmWriteRecord(log, NULL, &syncpt, NULL);
+        /* stamp end-of-record into header and trailer of the current page */
+        bp = log->bp;
+        lp = (struct logpage *) bp->l_ldata;
+        lp->t.eor = cpu_to_le16(bp->l_eor);
+        lp->h.eor = lp->t.eor;
+        lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
+        lbmIOWait(log->bp, lbmFREE);
+        /*
+         * synchronous update of the log superblock:
+         * mark log state as shutdown cleanly
+         * (i.e., Log does not need to be replayed).
+         */
+        rc = lbmRead(log, 1, &bpsuper);
+        if (!rc) {
+                logsuper = (struct logsuper *) bpsuper->l_ldata;
+                logsuper->state = cpu_to_le32(LOGREDONE);
+                logsuper->end = cpu_to_le32(lsn);
+                lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+                rc = lbmIOWait(bpsuper, lbmFREE);
+                jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
+                         lsn, log->page, log->eor);
+        }
+        /*
+         * shutdown per log i/o, whether or not the superblock was updated
+         */
+        lbmLogShutdown(log);
+        if (rc)
+                jfs_warn("lmLogShutdown: exit(%d)", rc);
+        return rc;
+}
+/*
+ * NAME:        lmLogFileSystem()
+ *
+ * FUNCTION:    insert (<activate> = true)/remove (<activate> = false)
+ *      file system into/from log active file system list.
+ *
+ * PARAMETER:   log      - log whose superblock (log page 1) is updated
+ *              sbi      - file system; sbi->uuid identifies it in the
+ *                         list, sbi->aggregate receives the slot index
+ *                         on activation
+ *              activate - insert/remove file system from active list.
+ *
+ * RETURN:      0        - success
+ *              -EMFILE  - activation: all MAX_ACTIVE slots are in use
+ *              -EIO     - deactivation: uuid not found in the list
+ *              errors returned by lbmRead() or lbmIOWait()
+ */
+static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
+                           int activate)
+{
+        struct lbuf *bpsuper;
+        struct logsuper *logsuper;
+        int slot = 0;
+        int rc;
+        rc = lbmRead(log, 1, &bpsuper);
+        if (rc)
+                return rc;
+        logsuper = (struct logsuper *) bpsuper->l_ldata;
+        if (activate) {
+                /* claim the first slot still holding the null uuid */
+                while (slot < MAX_ACTIVE &&
+                       memcmp(logsuper->active[slot].uuid, NULL_UUID, 16))
+                        slot++;
+                if (slot == MAX_ACTIVE) {
+                        jfs_warn("Too many file systems sharing journal!");
+                        lbmFree(bpsuper);
+                        return -EMFILE; /* Is there a better rc? */
+                }
+                memcpy(logsuper->active[slot].uuid, sbi->uuid, 16);
+                sbi->aggregate = slot;
+        } else {
+                /* release the first slot carrying our uuid */
+                while (slot < MAX_ACTIVE &&
+                       memcmp(logsuper->active[slot].uuid, sbi->uuid, 16))
+                        slot++;
+                if (slot == MAX_ACTIVE) {
+                        jfs_warn("Somebody stomped on the journal!");
+                        lbmFree(bpsuper);
+                        return -EIO;
+                }
+                memcpy(logsuper->active[slot].uuid, NULL_UUID, 16);
+        }
+        /*
+         * synchronous write log superblock:
+         *
+         * write sidestream bypassing write queue:
+         * at file system mount, log super block is updated for
+         * activation of the file system before any log record
+         * (MOUNT record) of the file system, and at file system
+         * unmount, all meta data for the file system has been
+         * flushed before log super block is updated for deactivation
+         * of the file system.
+         */
+        lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
+        return lbmIOWait(bpsuper, lbmFREE);
+}
+/*
+ *              log buffer manager (lbm)
+ *              ------------------------
+ *
+ * special purpose buffer manager supporting log i/o requirements.
+ *
+ * per log write queue:
+ * log pageout occurs in serial order by fifo write queue and
+ * restricting to a single i/o in progress at any one time.
+ * a circular singly-linked list
+ * (log->wqueue points to the tail, and buffers are linked via
+ * bp->l_wqnext field), and
+ * maintains log page in pageout or waiting for pageout in serial pageout.
+ */
+/*
+ *      lbmLogInit()
+ *
+ * initialize per log I/O setup at lmLogInit():
+ * reset the buffer cursor and write queue, then build a freelist of
+ * LOGPAGES log buffers, each backed by one zeroed page.
+ *
+ * RETURN:      0       - success
+ *              -ENOMEM - a buffer or its page could not be allocated;
+ *                        buffers built so far are released again
+ */
+static int lbmLogInit(struct jfs_log * log)
+{
+        struct lbuf *buf;
+        int remaining = LOGPAGES;
+        jfs_info("lbmLogInit: log:0x%p", log);
+        /* no current buffer, empty device write queue */
+        log->bp = NULL;
+        log->wqueue = NULL;
+        /*
+         * Each log has its own buffer pages allocated to it.  These are
+         * not managed by the page cache.  This ensures that a transaction
+         * writing to the log does not block trying to allocate a page from
+         * the page cache (for the log).  This would be bad, since page
+         * allocation waits on the kswapd thread that may be committing inodes
+         * which would cause log activity.  Was that clear?  I'm trying to
+         * avoid deadlock here.
+         */
+        init_waitqueue_head(&log->free_wait);
+        log->lbuf_free = NULL;
+        while (remaining-- > 0) {
+                buf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
+                if (!buf)
+                        goto error;
+                buf->l_ldata = (char *) get_zeroed_page(GFP_KERNEL);
+                if (!buf->l_ldata) {
+                        /* not on the freelist yet, so free it by hand */
+                        kfree(buf);
+                        goto error;
+                }
+                buf->l_log = log;
+                init_waitqueue_head(&buf->l_ioevent);
+                /* push onto the head of the freelist */
+                buf->l_freelist = log->lbuf_free;
+                log->lbuf_free = buf;
+        }
+        return 0;
+      error:
+        lbmLogShutdown(log);
+        return -ENOMEM;
+}
+/*
+ *      lbmLogShutdown()
+ *
+ * finalize per log I/O setup at lmLogShutdown():
+ * free every buffer found on the log's freelist together with its
+ * data page, and clear the current buffer cursor.
+ */
+static void lbmLogShutdown(struct jfs_log * log)
+{
+        struct lbuf *cur;
+        struct lbuf *following;
+        jfs_info("lbmLogShutdown: log:0x%p", log);
+        for (cur = log->lbuf_free; cur; cur = following) {
+                /* fetch the link before the node is freed */
+                following = cur->l_freelist;
+                free_page((unsigned long) cur->l_ldata);
+                kfree(cur);
+        }
+        log->bp = NULL;
+}
+/*
+ *      lbmAllocate()
+ *
+ * allocate an empty log buffer for log page <pn>:
+ * take the head of the log's buffer freelist, sleeping on
+ * log->free_wait while the list is empty, and reset its state.
+ *
+ * RETURN:      the buffer, with l_pn and l_blkno set for <pn>
+ */
+static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
+{
+        struct lbuf *buf;
+        unsigned long flags;
+        /*
+         * recycle from log buffer freelist if any
+         */
+        LCACHE_LOCK(flags);
+        LCACHE_SLEEP_COND(log->free_wait, (buf = log->lbuf_free), flags);
+        log->lbuf_free = buf->l_freelist;
+        LCACHE_UNLOCK(flags);
+        /* the buffer is private from here on */
+        buf->l_freelist = NULL;
+        buf->l_wqnext = NULL;
+        buf->l_flag = 0;
+        buf->l_ceor = 0;
+        buf->l_pn = pn;
+        buf->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));
+        return buf;
+}
+/*
+ *      lbmFree()
+ *
+ * release a log buffer to freelist;
+ * locked wrapper: calls lbmfree() between LCACHE_LOCK() and
+ * LCACHE_UNLOCK().
+ */
+static void lbmFree(struct lbuf * bp)
+{
+        unsigned long irqflags;
+        LCACHE_LOCK(irqflags);
+        lbmfree(bp);
+        LCACHE_UNLOCK(irqflags);
+}
+/*
+ *      lbmfree()
+ *
+ * put a log buffer back at the head of its log's freelist and wake
+ * up anybody sleeping on log->free_wait.  takes no lock itself (see
+ * lbmFree() for the locked variant); the buffer must not be linked
+ * on a write queue.
+ */
+static void lbmfree(struct lbuf * bp)
+{
+        struct jfs_log *owner = bp->l_log;
+        assert(bp->l_wqnext == NULL);
+        /* push onto the head of the freelist */
+        bp->l_freelist = owner->lbuf_free;
+        owner->lbuf_free = bp;
+        wake_up(&owner->free_wait);
+}
+/*
+ * NAME:        lbmRedrive
+ *
+ * FUNCTION:    add a log buffer to the head of the log redrive list
+ *              and wake up jfs_IO_thread_wait
+ *
+ * PARAMETER:
+ *     bp       - log buffer
+ *
+ * NOTES:
+ *      Takes log_redrive_lock (irq-saving) around the list update;
+ *      the wake-up is issued after the lock has been dropped.
+ */
+static inline void lbmRedrive(struct lbuf *bp)
+{
+        unsigned long irqflags;
+        spin_lock_irqsave(&log_redrive_lock, irqflags);
+        /* push onto the head of the global redrive list */
+        bp->l_redrive_next = log_redrive_list;
+        log_redrive_list = bp;
+        spin_unlock_irqrestore(&log_redrive_lock, irqflags);
+        wake_up(&jfs_IO_thread_wait);
+}
+/*
+ *      lbmRead()
+ *
+ * read log page <pn> into a newly allocated log buffer:
+ * submit a one-segment READ_SYNC bio of LOGPSIZE bytes and sleep on
+ * the buffer's l_ioevent until l_flag is no longer exactly lbmREAD.
+ *
+ * PARAMETER:   log     - log to read from
+ *              pn      - log page number
+ *              bpp     - receives the log buffer (out)
+ *
+ * RETURN:      0 (always)
+ */
+static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
+{
+        struct lbuf *buf;
+        struct bio *rbio;
+        struct bio_vec *vec;
+        /*
+         * allocate a log buffer and hand it to the caller
+         */
+        buf = lbmAllocate(log, pn);
+        *bpp = buf;
+        jfs_info("lbmRead: bp:0x%p pn:0x%x", buf, pn);
+        buf->l_flag |= lbmREAD;
+        /* describe the transfer: one page-sized segment */
+        rbio = bio_alloc(GFP_NOFS, 1);
+        rbio->bi_bdev = log->bdev;
+        rbio->bi_sector = buf->l_blkno << (log->l2bsize - 9);
+        vec = &rbio->bi_io_vec[0];
+        vec->bv_page = virt_to_page(buf->l_ldata);
+        vec->bv_offset = 0;
+        vec->bv_len = LOGPSIZE;
+        rbio->bi_vcnt = 1;
+        rbio->bi_idx = 0;
+        rbio->bi_size = LOGPSIZE;
+        /* completion is reported to lbmIODone() with the buffer attached */
+        rbio->bi_private = buf;
+        rbio->bi_end_io = lbmIODone;
+        submit_bio(READ_SYNC, rbio);
+        wait_event(buf->l_ioevent, (buf->l_flag != lbmREAD));
+        return 0;
+}
+/*
+ *      lbmWrite()
+ *
+ * queue a log buffer on the log's write queue and, when it is at the
+ * head of the queue and flagged lbmWRITE, start (or redrive) its pageout.
+ *
+ * PARAMETER:
+ *      log     - log that owns the write queue (log->wqueue)
+ *      bp      - log buffer to write; bp->l_pn selects the log page
+ *      flag    - new value for bp->l_flag (lbmWRITE, lbmSYNC, ...)
+ *      cant_block - non-zero: hand the buffer to lbmRedrive() instead
+ *                of calling lbmStartIO() from this context
+ *
+ * log->wqueue points at the tail of a circular list linked through
+ * l_wqnext, so tail->l_wqnext is the head of the queue.
+ *
+ * buffer at head of pageout queue stays after completion of
+ * partial-page pageout and redriven by explicit initiation of
+ * pageout by caller until full-page pageout is completed and
+ * released.
+ *
+ * device driver i/o done redrives pageout of new buffer at
+ * head of pageout queue when current buffer at head of pageout
+ * queue is released at the completion of its full-page pageout.
+ *
+ * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
+ * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
+ *
+ * NOTE(review): the last branch below releases and re-takes LOGGC_LOCK
+ * around lbmStartIO(), so such callers presumably enter with LOGGC_LOCK
+ * held - confirm against the callers.
+ */
+static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
+                     int cant_block)
+{
+        struct lbuf *tail;
+        unsigned long flags;
+        jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);
+        /* map the logical block address to physical block address */
+        bp->l_blkno =
+            log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
+        LCACHE_LOCK(flags);             /* disable+lock */
+        /*
+         * initialize buffer for device driver
+         */
+        bp->l_flag = flag;
+        /*
+         *      insert bp at tail of write queue associated with log
+         *
+         * (request is either for bp already/currently at head of queue
+         * or new bp to be inserted at tail)
+         */
+        tail = log->wqueue;
+        /* is buffer not already on write queue ? (l_wqnext is NULL off-queue) */
+        if (bp->l_wqnext == NULL) {
+                /* insert at tail of wqueue */
+                if (tail == NULL) {
+                        log->wqueue = bp;
+                        bp->l_wqnext = bp;      /* sole element links to itself */
+                } else {
+                        log->wqueue = bp;
+                        bp->l_wqnext = tail->l_wqnext;  /* new tail -> head */
+                        tail->l_wqnext = bp;
+                }
+                tail = bp;
+        }
+        /* nothing to start unless bp is at head of wqueue and is for write */
+        if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
+                LCACHE_UNLOCK(flags);   /* unlock+enable */
+                return;
+        }
+        LCACHE_UNLOCK(flags);   /* unlock+enable */
+        if (cant_block)
+                lbmRedrive(bp);         /* jfsIOWait() will call lbmStartIO() */
+        else if (flag & lbmSYNC)
+                lbmStartIO(bp);
+        else {
+                LOGGC_UNLOCK(log);      /* not held across the i/o submission */
+                lbmStartIO(bp);
+                LOGGC_LOCK(log);
+        }
+}
+/*
+ * NAME:        lbmDirectWrite()
+ *
+ * FUNCTION:    initiate pageout bypassing the write queue, for a
+ *              sidestream (e.g., log superblock) write
+ *
+ * PARAMETER:
+ *      log     - log the page belongs to
+ *      bp      - log buffer to write; bp->l_pn selects the page
+ *      flag    - flags for the write; lbmDIRECT is OR-ed in here
+ */
+static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
+{
+        jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
+                 bp, flag, bp->l_pn);
+        /* translate the log page number into a block address */
+        bp->l_blkno =
+            log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
+        /* lbmDIRECT makes lbmIODone() return before its write queue handling */
+        bp->l_flag = flag | lbmDIRECT;
+        /* start the pageout straight away */
+        lbmStartIO(bp);
+}
+/*
+ * NAME:        lbmStartIO()
+ *
+ * FUNCTION:    Interface to DD strategy routine: build a one-page write
+ *              bio for the log buffer and submit it
+ *
+ * PARAMETER:
+ *      bp      - log buffer to write; bp->l_blkno and bp->l_log are
+ *                read here and must already be set
+ *
+ * RETURN:      none
+ *
+ * serialization: LCACHE_LOCK() is NOT held during log i/o;
+ */
+static void lbmStartIO(struct lbuf * bp)
+{
+        struct bio *bio;
+        struct jfs_log *log = bp->l_log;
+        /* no trailing "\n": matches the other jfs_info() calls */
+        jfs_info("lbmStartIO");
+        bio = bio_alloc(GFP_NOFS, 1);
+        bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
+        bio->bi_bdev = log->bdev;
+        bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
+        bio->bi_io_vec[0].bv_len = LOGPSIZE;
+        bio->bi_io_vec[0].bv_offset = 0;
+        bio->bi_vcnt = 1;
+        bio->bi_idx = 0;
+        bio->bi_size = LOGPSIZE;
+        bio->bi_end_io = lbmIODone;
+        bio->bi_private = bp;
+        /* check if journaling to disk has been disabled */
+        if (!log->no_integrity) {
+                submit_bio(WRITE_SYNC, bio);
+                INCREMENT(lmStat.submitted);
+        }
+        else {
+                /*
+                 * no device i/o: run the completion handler ourselves.
+                 * lbmIODone() only proceeds when bi_size is zero.
+                 */
+                bio->bi_size = 0;
+                lbmIODone(bio, 0, 0); /* 2nd argument appears to not be used => 0
+                                       *  3rd argument appears to not be used => 0
+                                       */
+        }
+}
+/*
+ * NAME:        lbmIOWait()
+ *
+ * FUNCTION:    wait until i/o on a log buffer is flagged lbmDONE
+ *
+ * PARAMETER:
+ *      bp      - log buffer with i/o outstanding
+ *      flag    - lbmFREE: return bp to the freelist once the i/o is done
+ *
+ * RETURN:      0       - success
+ *              -EIO    - lbmERROR was set on the buffer
+ */
+static int lbmIOWait(struct lbuf * bp, int flag)
+{
+        unsigned long irqflags;
+        int rc = 0;
+        jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
+        LCACHE_LOCK(irqflags);          /* disable+lock */
+        LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), irqflags);
+        /* lbmIODone() sets lbmERROR when the bio is not up to date */
+        if (bp->l_flag & lbmERROR)
+                rc = -EIO;
+        if (flag & lbmFREE)
+                lbmfree(bp);
+        LCACHE_UNLOCK(irqflags);        /* unlock+enable */
+        jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
+        return rc;
+}
+/*
+ *      lbmIODone()
+ *
+ * bio completion handler (installed as bi_end_io by lbmRead() and
+ * lbmStartIO()) for log page reads and writes
+ *
+ * PARAMETER:
+ *      bio     - the bio; bio->bi_private is the struct lbuf
+ *      bytes_done - not referenced in this function
+ *      error   - not referenced in this function; failure is detected
+ *                from BIO_UPTODATE in bio->bi_flags instead
+ *
+ * RETURN:      1       - bio->bi_size is non-zero; nothing is done
+ *              0       - completion was processed
+ *
+ * executed at INTIODONE level
+ */
+static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
+{
+        struct lbuf *bp = bio->bi_private;
+        struct lbuf *nextbp, *tail;
+        struct jfs_log *log;
+        unsigned long flags;
+        if (bio->bi_size)
+                return 1;       /* presumably i/o still outstanding - confirm */
+        /*
+         * get back jfs buffer bound to the i/o buffer
+         */
+        jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);
+        LCACHE_LOCK(flags);             /* disable+lock */
+        bp->l_flag |= lbmDONE;
+        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                bp->l_flag |= lbmERROR; /* lbmIOWait() maps this to -EIO */
+                jfs_err("lbmIODone: I/O error in JFS log");
+        }
+        bio_put(bio);
+        /*
+         *      pagein completion
+         */
+        if (bp->l_flag & lbmREAD) {
+                bp->l_flag &= ~lbmREAD;
+                LCACHE_UNLOCK(flags);   /* unlock+enable */
+                /* wakeup I/O initiator (lbmRead() waits on l_ioevent) */
+                LCACHE_WAKEUP(&bp->l_ioevent);
+                return 0;
+        }
+        /*
+         *      pageout completion
+         *
+         * the bp at the head of write queue has completed pageout.
+         *
+         * if single-commit/full-page pageout, remove the current buffer
+         * from head of pageout queue, and redrive pageout with
+         * the new buffer at head of pageout queue;
+         * otherwise, the partial-page pageout buffer stays at
+         * the head of pageout queue to be redriven for pageout
+         * by lmGroupCommit() until full-page pageout is completed.
+         */
+        bp->l_flag &= ~lbmWRITE;
+        INCREMENT(lmStat.pagedone);
+        /* update committed lsn */
+        log = bp->l_log;
+        log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;
+        if (bp->l_flag & lbmDIRECT) {
+                /* direct writes never went on the write queue */
+                LCACHE_WAKEUP(&bp->l_ioevent);
+                LCACHE_UNLOCK(flags);
+                return 0;
+        }
+        tail = log->wqueue;
+        /* single element queue */
+        if (bp == tail) {
+                /* remove head buffer of full-page pageout
+                 * from log device write queue
+                 */
+                if (bp->l_flag & lbmRELEASE) {
+                        log->wqueue = NULL;
+                        bp->l_wqnext = NULL;
+                }
+        }
+        /* multi element queue */
+        else {
+                /* remove head buffer of full-page pageout
+                 * from log device write queue
+                 */
+                if (bp->l_flag & lbmRELEASE) {
+                        nextbp = tail->l_wqnext = bp->l_wqnext;
+                        bp->l_wqnext = NULL;
+                        /*
+                         * redrive pageout of next page at head of write queue:
+                         * redrive next page without any bound tblk
+                         * (i.e., page w/o any COMMIT records), or
+                         * first page of new group commit which has been
+                         * queued after current page (subsequent pageout
+                         * is performed synchronously, except page without
+                         * any COMMITs) by lmGroupCommit() as indicated
+                         * by lbmWRITE flag;
+                         */
+                        if (nextbp->l_flag & lbmWRITE) {
+                                /*
+                                 * We can't do the I/O at interrupt time.
+                                 * The jfsIO thread can do it
+                                 */
+                                lbmRedrive(nextbp);
+                        }
+                }
+        }
+        /*
+         *      synchronous pageout:
+         *
+         * buffer has not necessarily been removed from write queue
+         * (e.g., synchronous write of partial-page with COMMIT):
+         * leave buffer for i/o initiator to dispose
+         */
+        if (bp->l_flag & lbmSYNC) {
+                LCACHE_UNLOCK(flags);   /* unlock+enable */
+                /* wakeup I/O initiator */
+                LCACHE_WAKEUP(&bp->l_ioevent);
+        }
+        /*
+         *      Group Commit pageout:
+         */
+        else if (bp->l_flag & lbmGC) {
+                LCACHE_UNLOCK(flags);   /* lmPostGC() is called without LCACHE_LOCK */
+                lmPostGC(bp);
+        }
+        /*
+         *      asynchronous pageout:
+         *
+         * buffer must have been removed from write queue:
+         * insert buffer at head of freelist where it can be recycled
+         */
+        else {
+                assert(bp->l_flag & lbmRELEASE);
+                assert(bp->l_flag & lbmFREE);
+                lbmfree(bp);
+                LCACHE_UNLOCK(flags);   /* unlock+enable */
+        }
+        return 0;
+}
+/*
+ * NAME:        jfsIOWait()
+ *
+ * FUNCTION:    body of the "jfsIO" thread: starts the i/o for every
+ *              log buffer queued by lbmRedrive()
+ *
+ * PARAMETER:
+ *      arg     - not referenced
+ *
+ * NOTES:
+ *      Signals jfsIOwait once after daemonizing and again, through
+ *      complete_and_exit(), once jfs_stop_threads is seen set.
+ *      log_redrive_lock is dropped around each lbmStartIO() call.
+ */
+int jfsIOWait(void *arg)
+{
+        struct lbuf *bp;
+        daemonize("jfsIO");
+        complete(&jfsIOwait);
+        for (;;) {
+                DECLARE_WAITQUEUE(wq, current);
+                spin_lock_irq(&log_redrive_lock);
+                /* drain the redrive list, one buffer at a time */
+                while ((bp = log_redrive_list) != NULL) {
+                        log_redrive_list = bp->l_redrive_next;
+                        bp->l_redrive_next = NULL;
+                        spin_unlock_irq(&log_redrive_lock);
+                        lbmStartIO(bp);
+                        spin_lock_irq(&log_redrive_lock);
+                }
+                if (!(current->flags & PF_FREEZE)) {
+                        /*
+                         * join the wait queue and mark the task
+                         * sleeping before the lock is dropped, so a
+                         * buffer queued after the drain above still
+                         * wakes this thread
+                         */
+                        add_wait_queue(&jfs_IO_thread_wait, &wq);
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        spin_unlock_irq(&log_redrive_lock);
+                        schedule();
+                        current->state = TASK_RUNNING;
+                        remove_wait_queue(&jfs_IO_thread_wait, &wq);
+                } else {
+                        spin_unlock_irq(&log_redrive_lock);
+                        refrigerator(PF_FREEZE);
+                }
+                if (jfs_stop_threads)
+                        break;
+        }
+        jfs_info("jfsIOWait being killed!");
+        complete_and_exit(&jfsIOwait, 0);
+}
+/*
+ * NAME:        lmLogFormat()/jfs_logform()
+ *
+ * FUNCTION:    format file system log
+ *
+ * PARAMETERS:
+ *      log     - volume log
+ *      logAddress - start address of log space in FS block
+ *      logSize - length of log space in FS block;
+ *
+ * RETURN:      0       - success
+ *              -EIO    - i/o error
+ *
+ * NOTES:
+ *      One log buffer is reused for every page: it is filled in,
+ *      written with lbmStartIO() and waited for with lbmIOWait()
+ *      before the next page is prepared.
+ *
+ * XXX: We're synchronously writing one page at a time.  This needs to
+ *      be improved by writing multiple pages at once.
+ */
+int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
+{
+        struct jfs_sb_info *sbi;
+        struct logsuper *logsuper;
+        struct logpage *lp;
+        struct lrd *lrd_ptr;
+        struct lbuf *bp;
+        int npages;             /* log size in log pages */
+        int lspn;               /* log sequence page number */
+        int rc;
+        jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
+                 (long long)logAddress, logSize);
+        /* block geometry is taken from the first entry on log->sb_list */
+        sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list);
+        /* allocate a log buffer */
+        bp = lbmAllocate(log, 1);
+        npages = logSize >> sbi->l2nbperpage;
+        /*
+         *      log space:
+         *
+         * page 0 - reserved;
+         * page 1 - log superblock;
+         * page 2 - log data page: A SYNC log record is written
+         *          into this page at logform time;
+         * pages 3-N - log data page: set to empty log data pages;
+         */
+        /*
+         *      init log superblock: log page 1
+         */
+        logsuper = (struct logsuper *) bp->l_ldata;
+        logsuper->magic = cpu_to_le32(LOGMAGIC);
+        logsuper->version = cpu_to_le32(LOGVERSION);
+        logsuper->state = cpu_to_le32(LOGREDONE);
+        logsuper->flag = cpu_to_le32(sbi->mntflag);     /* ? */
+        logsuper->size = cpu_to_le32(npages);
+        logsuper->bsize = cpu_to_le32(sbi->bsize);
+        logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
+        logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);
+        bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
+        bp->l_blkno = logAddress + sbi->nbperpage;
+        lbmStartIO(bp);
+        rc = lbmIOWait(bp, 0);
+        if (rc)
+                goto out;
+        /*
+         *      init pages 2 to npages-1 as log data pages:
+         *
+         * log page sequence number (lpsn) initialization:
+         *
+         * pn:   0     1     2     3                 n-1
+         *       +-----+-----+=====+=====+===.....===+=====+
+         * lspn:             N-1   0     1           N-2
+         *                   <--- N page circular file ---->
+         *
+         * the N (= npages-2) data pages of the log are maintained as
+         * a circular file for the log records;
+         * lpsn grows by 1 monotonically as each log page is written
+         * to the circular file of the log;
+         * and setLogpage() will not reset the page number even if
+         * the eor is equal to LOGPHDRSIZE. In order for binary search
+         * to still work in the find-log-end process, we have to simulate
+         * the log wrap situation at log format time.
+         * The 1st log page written will have the highest lpsn. Then
+         * the succeeding log pages will have ascending order of
+         * the lspn starting from 0, ... (N-2)
+         */
+        lp = (struct logpage *) bp->l_ldata;
+        /*
+         * initialize 1st log page to be written: lpsn = N - 1;
+         * a SYNCPT log record is written to this page
+         */
+        lp->t.page = cpu_to_le32(npages - 3);
+        lp->h.page = lp->t.page;
+        lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);
+        lp->h.eor = lp->t.eor;
+        lrd_ptr = (struct lrd *) &lp->data;
+        lrd_ptr->logtid = 0;
+        lrd_ptr->backchain = 0;
+        lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
+        lrd_ptr->length = 0;
+        lrd_ptr->log.syncpt.sync = 0;
+        bp->l_blkno += sbi->nbperpage;
+        bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
+        lbmStartIO(bp);
+        rc = lbmIOWait(bp, 0);
+        if (rc)
+                goto out;
+        /*
+         *      initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
+         */
+        for (lspn = 0; lspn < npages - 3; lspn++) {
+                /* empty page: eor sits right after the page header */
+                lp->t.page = cpu_to_le32(lspn);
+                lp->h.page = lp->t.page;
+                lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
+                lp->h.eor = lp->t.eor;
+                bp->l_blkno += sbi->nbperpage;
+                bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
+                lbmStartIO(bp);
+                rc = lbmIOWait(bp, 0);
+                if (rc)
+                        goto out;
+        }
+        rc = 0;
+out:
+        /*
+         *      finalize log
+         */
+        /* release the buffer */
+        lbmFree(bp);
+        return rc;
+}
+#ifdef CONFIG_JFS_STATISTICS
+/*
+ * jfs_lmstats_read() - format the log manager counters held in lmStat
+ * as text.
+ *
+ * The whole report is written at the start of buffer; the window the
+ * caller asked for is then described through *start and the return value.
+ *
+ *      buffer  - output buffer
+ *      start   - out: set to buffer + offset
+ *      offset  - position in the report at which this read begins
+ *      length  - maximum number of bytes wanted
+ *      eof     - out: set to 1 when what is left fits within length
+ *      data    - not referenced
+ *
+ * Returns the number of bytes available at *start: at most length and
+ * never negative.
+ */
+int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
+                      int *eof, void *data)
+{
+        int len;
+        len = sprintf(buffer,
+                      "JFS Logmgr stats\n"
+                      "================\n"
+                      "commits = %d\n"
+                      "writes submitted = %d\n"
+                      "writes completed = %d\n"
+                      "full pages submitted = %d\n"
+                      "partial pages submitted = %d\n",
+                      lmStat.commit,
+                      lmStat.submitted,
+                      lmStat.pagedone,
+                      lmStat.full_page,
+                      lmStat.partial_page);
+        /* skip the part of the report that precedes offset */
+        *start = buffer + offset;
+        len -= offset;
+        if (len <= length)
+                *eof = 1;
+        else
+                len = length;
+        /* offset beyond the end of the report: nothing to return */
+        if (len < 0)
+                len = 0;
+        return len;
+}
+#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
new file mode 100644
index 000000000000..141ad74010c9
--- /dev/null
+++ b/fs/jfs/jfs_logmgr.h
@@ -0,0 +1,510 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_LOGMGR
+#define _H_JFS_LOGMGR
+#include "jfs_filsys.h"
+#include "jfs_lock.h"
+/*
+ *      log manager configuration parameters
+ */
+/* log page size */
+#define LOGPSIZE        4096    /* in bytes; sizeof(struct logpage) */
+#define L2LOGPSIZE      12      /* log2 of LOGPSIZE */
+#define LOGPAGES        16      /* Log pages per mounted file system */
+/*
+ *      log logical volume
+ *
+ * a log is used to make the commit operation on journalled 
+ * files within the same logical volume group atomic.
+ * a log is implemented with a logical volume.
+ * there is one log per logical volume group. 
+ *
+ * block 0 of the log logical volume is not used (ipl etc).
+ * block 1 contains a log "superblock" and is used by logFormat(),
+ * lmLogInit(), lmLogShutdown(), and logRedo() to record status 
+ * of the log but is not otherwise used during normal processing. 
+ * blocks 2 - (N-1) are used to contain log records.
+ *
+ * when a volume group is varied-on-line, logRedo() must have 
+ * been executed before the file systems (logical volumes) in 
+ * the volume group can be mounted.
+ */
+/*
+ *      log superblock (block 1 of logical volume)
+ */
+#define LOGSUPER_B      1       /* block holding the log superblock */
+#define LOGSTART_B      2       /* first block holding log records */
+#define LOGMAGIC        0x87654321      /* value stored in logsuper.magic */
+#define LOGVERSION      1       /* value stored in logsuper.version */
+#define MAX_ACTIVE      128     /* Max active file systems sharing log */
+struct logsuper {
+        __le32 magic;           /* 4: log lv identifier (LOGMAGIC) */
+        __le32 version;         /* 4: version number (LOGVERSION) */
+        __le32 serial;          /* 4: log open/mount counter */
+        __le32 size;            /* 4: size in number of LOGPSIZE blocks */
+        __le32 bsize;           /* 4: logical block size in bytes */
+        __le32 l2bsize;         /* 4: log2 of bsize */
+        __le32 flag;            /* 4: option */
+        __le32 state;           /* 4: state - one of the LOGMOUNT ..
+                                 *    LOGREADERR values defined below
+                                 */
+        __le32 end;             /* 4: addr of last log record set by logredo */
+        char uuid[16];          /* 16: 128-bit journal uuid */
+        char label[16];         /* 16: journal label */
+        struct {
+                char uuid[16];
+        } active[MAX_ACTIVE];   /* 2048: active file systems list,
+                                 *       MAX_ACTIVE entries of 16 bytes
+                                 */
+};
+#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"    /* 16 NUL bytes, the size of a uuid[] field */
+/* log flag: commit option (see jfs_filsys.h) */
+/* log state (values of logsuper.state) */
+#define LOGMOUNT        0       /* log mounted by lmLogInit() */
+#define LOGREDONE       1       /* log shutdown by lmLogShutdown().
+                                 * log redo completed by logredo().
+                                 * also the state written by lmLogFormat().
+                                 */
+#define LOGWRAP         2       /* log wrapped */
+#define LOGREADERR      3       /* log read error detected in logredo() */
+/*
+ *      log logical page
+ *
+ * (this comment should be rewritten !)
+ * the header and trailer structures (h,t) will normally have 
+ * the same page and eor value.
+ * An exception to this occurs when a complete page write is not 
+ * accomplished on a power failure. Since the hardware may "split write"
+ * sectors in the page, any out of order sequence may occur during powerfail 
+ * and needs to be recognized during log replay.  The xor value is
+ * an "exclusive or" of all log words in the page up to eor.  This
+ * 32 bit eor is stored with the top 16 bits in the header and the
+ * bottom 16 bits in the trailer.  logredo can easily recognize pages
+ * that were not completed by reconstructing this eor and checking 
+ * the log page.
+ *
+ * Previous versions of the operating system did not allow split 
+ * writes and detected partially written records in logredo by 
+ * ordering the updates to the header, trailer, and the move of data 
+ * into the logdata area.  The order: (1) data is moved (2) header 
+ * is updated (3) trailer is updated.  In logredo, when the header 
+ * differed from the trailer, the header and trailer were reconciled 
+ * as follows: if h.page != t.page they were set to the smaller of 
+ * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) 
+ * h.eor != t.eor they were set to the smaller of their two values.
+ */
+struct logpage {
+        struct {                /* header */
+                __le32 page;    /* 4: log sequence page number */
+                __le16 rsrvd;   /* 2: */
+                __le16 eor;     /* 2: end-of-log offset of last record write */
+        } h;
+        __le32 data[LOGPSIZE / 4 - 4];  /* log record area: the page minus
+                                         * the 16 bytes of header and trailer
+                                         */
+        struct {                /* trailer */
+                __le32 page;    /* 4: normally the same as h.page */
+                __le16 rsrvd;   /* 2: */
+                __le16 eor;     /* 2: normally the same as h.eor */
+        } t;
+};
+#define LOGPHDRSIZE     8       /* log page header size in bytes (h) */
+#define LOGPTLRSIZE     8       /* log page trailer size in bytes (t) */
+/*
+ *      log record
+ *
+ * (this comment should be rewritten !)
+ * jfs uses only "after" log records (only a single writer is allowed
+ * in a page, pages are written to temporary paging space
+ * if they must be written to disk before commit, and i/o is
+ * scheduled for modified pages to their home location after
+ * the log records containing the after values and the commit 
+ * record is written to the log on disk, undo discards the copy
+ * in main-memory.)
+ *
+ * a log record consists of a data area of variable length followed by 
+ * a descriptor of fixed size LOGRDSIZE bytes.
+ * the data area is rounded up to an integral number of 4-bytes and
+ * must be no longer than LOGPSIZE.
+ * the descriptor is of size of multiple of 4-bytes and aligned on a 
+ * 4-byte boundary. 
+ * records are packed one after the other in the data area of log pages.
+ * (sometimes a DUMMY record is inserted so that at least one record ends 
+ * on every page or the longest record is placed on at most two pages).
+ * the field eor in page header/trailer points to the byte following 
+ * the last record on a page.
+ */
+/* log record types (stored in lrd.type) */
+#define LOG_COMMIT              0x8000
+#define LOG_SYNCPT              0x4000
+#define LOG_MOUNT               0x2000
+#define LOG_REDOPAGE            0x0800
+#define LOG_NOREDOPAGE          0x0080
+#define LOG_NOREDOINOEXT        0x0040
+#define LOG_UPDATEMAP           0x0008
+#define LOG_NOREDOFILE          0x0001
+/* REDOPAGE/NOREDOPAGE log record data type
+ * (presumably the redopage.type / noredopage.type field - confirm)
+ */
+#define LOG_INODE               0x0001
+#define LOG_XTREE               0x0002
+#define LOG_DTREE               0x0004
+#define LOG_BTROOT              0x0010
+#define LOG_EA                  0x0020
+#define LOG_ACL                 0x0040
+#define LOG_DATA                0x0080
+#define LOG_NEW                 0x0100
+#define LOG_EXTEND              0x0200
+#define LOG_RELOCATE            0x0400
+#define LOG_DIR_XTREE           0x0800  /* Xtree is in directory inode */
+/* UPDATEMAP log record descriptor type
+ * (presumably the updatemap.type field - confirm)
+ */
+#define LOG_ALLOCXADLIST        0x0080
+#define LOG_ALLOCPXDLIST        0x0040
+#define LOG_ALLOCXAD            0x0020
+#define LOG_ALLOCPXD            0x0010
+#define LOG_FREEXADLIST         0x0008
+#define LOG_FREEPXDLIST         0x0004
+#define LOG_FREEXAD             0x0002
+#define LOG_FREEPXD             0x0001
+struct lrd {
+        /*
+         * type independent area
+         */
+        __le32 logtid;          /* 4: log transaction identifier */
+        __le32 backchain;       /* 4: ptr to prev record of same transaction */
+        __le16 type;            /* 2: record type (LOG_COMMIT .. LOG_NOREDOFILE) */
+        __le16 length;          /* 2: length of data in record (in bytes) */
+        __le32 aggregate;       /* 4: file system lv/aggregate */
+        /* (16) */
+        /*
+         * type dependent area (20)
+         */
+        union {
+                /*
+                 *      COMMIT: commit
+                 *
+                 * transaction commit: no type-dependent information;
+                 */
+                /*
+                 *      REDOPAGE: after-image
+                 *
+                 * apply after-image;
+                 *
+                 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+                 */
+                struct {
+                        __le32 fileset; /* 4: fileset number */
+                        __le32 inode;   /* 4: inode number */
+                        __le16 type;    /* 2: REDOPAGE record type */
+                        __le16 l2linesize;      /* 2: log2 of line size */
+                        pxd_t pxd;      /* 8: on-disk page pxd */
+                } redopage;     /* (20) */
+                /*
+                 *      NOREDOPAGE: the page is freed
+                 *
+                 * do not apply after-image records which precede this record
+                 * in the log with the same page block number to this page.
+                 *
+                 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+                 */
+                struct {
+                        __le32 fileset; /* 4: fileset number */
+                        __le32 inode;   /* 4: inode number */
+                        __le16 type;    /* 2: NOREDOPAGE record type */
+                        __le16 rsrvd;   /* 2: reserved */
+                        pxd_t pxd;      /* 8: on-disk page pxd */
+                } noredopage;   /* (20) */
+                /*
+                 *      UPDATEMAP: update block allocation map
+                 *
+                 * either in-line PXD,
+                 * or     out-of-line  XADLIST;
+                 *
+                 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
+                 */
+                struct {
+                        __le32 fileset; /* 4: fileset number */
+                        __le32 inode;   /* 4: inode number */
+                        __le16 type;    /* 2: UPDATEMAP record type */
+                        __le16 nxd;     /* 2: number of extents */
+                        pxd_t pxd;      /* 8: pxd */
+                } updatemap;    /* (20) */
+                /*
+                 *      NOREDOINOEXT: the inode extent is freed
+                 *
+                 * do not apply after-image records which precede this
+                 * record in the log with any of the 4 page block
+                 * numbers in this inode extent.
+                 *
+                 * NOTE: The fileset and pxd fields MUST remain at the
+                 *       same offsets as in the REDOPAGE record format
+                 *       (fileset at 0, pxd at 12).
+                 *
+                 */
+                struct {
+                        __le32 fileset; /* 4: fileset number */
+                        __le32 iagnum;  /* 4: IAG number     */
+                        __le32 inoext_idx;      /* 4: inode extent index */
+                        pxd_t pxd;      /* 8: on-disk page pxd */
+                } noredoinoext; /* (20) */
+                /*
+                 *      SYNCPT: log sync point
+                 *
+                 * replay log up to syncpt address specified;
+                 */
+                struct {
+                        __le32 sync;    /* 4: syncpt address (0 = here) */
+                } syncpt;
+                /*
+                 *      MOUNT: file system mount
+                 *
+                 * file system mount: no type-dependent information;
+                 */
+                /*
+                 *      ? FREEXTENT: free specified extent(s)
+                 *
+                 * free specified extent(s) from block allocation map
+                 * N.B.: nextents should be length of data/sizeof(xad_t)
+                 */
+                struct {
+                        __le32 type;    /* 4: FREEXTENT record type */
+                        __le32 nextent; /* 4: number of extents */
+                        /* data: PXD or XAD list */
+                } freextent;
+                /*
+                 *      ? NOREDOFILE: this file is freed
+                 *
+                 * do not apply records which precede this record in the log
+                 * with the same inode number.
+                 *
+                 * NOREDOFILE must be the first to be written at commit
+                 * (last to be read in logredo()) - it prevents
+                 * replay of preceding updates of all preceding generations
+                 * of the inumber esp. the on-disk inode itself,
+                 * but does NOT prevent
+                 * replay of the ...
+                 * (NOTE(review): this sentence was left unfinished)
+                 */
+                struct {
+                        __le32 fileset; /* 4: fileset number */
+                        __le32 inode;   /* 4: inode number */
+                } noredofile;
+                /*
+                 *      ? NEWPAGE: 
+                 *
+                 * metadata type dependent
+                 */
+                struct {
+                        __le32 fileset; /* 4: fileset number */
+                        __le32 inode;   /* 4: inode number */
+                        __le32 type;    /* 4: NEWPAGE record type */
+                        pxd_t pxd;      /* 8: on-disk page pxd */
+                } newpage;
+                /*
+                 *      ? DUMMY: filler
+                 *
+                 * no type-dependent information
+                 */
+        } log;
+};                                      /* (36) */
+#define LOGRDSIZE       (sizeof(struct lrd))    /* 36 per the layout above */
+/*
+ *      line vector descriptor
+ */
+struct lvd {
+        __le16 offset;
+        __le16 length;
+};
+/*
+ *      log logical volume
+ *
+ * Per-log control structure.  The leading "N:" in the field comments
+ * appears to be a historical byte-size annotation; it does not track the
+ * real size of pointers, locks or wait queues on every architecture.
+ */
+struct jfs_log {
+        struct list_head sb_list;/*  This is used to sync metadata
+                                 *    before writing syncpt.
+                                 */
+        struct list_head journal_list; /* Global list */
+        struct block_device *bdev; /* 4: log lv pointer */
+        int serial;             /* 4: log mount serial number */
+        s64 base;               /* @8: log extent address (inline log ) */
+        int size;               /* 4: log size in log page (in page) */
+        int l2bsize;            /* 4: log2 of bsize */
+        long flag;              /* 4: flag (see "Log flag" values below) */
+        struct lbuf *lbuf_free; /* 4: free lbufs */
+        wait_queue_head_t free_wait;    /* 4: presumably waiters for a free lbuf - confirm */
+        /* log write */
+        int logtid;             /* 4: log tid */
+        int page;               /* 4: page number of eol page */
+        int eor;                /* 4: eor of last record in eol page */
+        struct lbuf *bp;        /* 4: current log page buffer */
+        struct semaphore loglock;       /* 4: log write serialization lock */
+        /* syncpt */
+        int nextsync;           /* 4: bytes to write before next syncpt */
+        int active;             /* 4: */
+        wait_queue_head_t syncwait;     /* 4: */
+        /* commit */
+        uint cflag;             /* 4: presumably the logGC_* group commit flags - confirm */
+        struct list_head cqueue; /* FIFO commit queue */
+        struct tblock *flush_tblk; /* tblk we're waiting on for flush */
+        int gcrtc;              /* 4: GC_READY transaction count */
+        struct tblock *gclrt;   /* 4: latest GC_READY transaction */
+        spinlock_t gclock;      /* 4: group commit lock */
+        int logsize;            /* 4: log data area size in byte */
+        int lsn;                /* 4: end-of-log */
+        int clsn;               /* 4: clsn */
+        int syncpt;             /* 4: addr of last syncpt record */
+        int sync;               /* 4: addr from last logsync() */
+        struct list_head synclist;      /* 8: logsynclist anchor */
+        spinlock_t synclock;    /* 4: synclist lock (see LOGSYNC_LOCK) */
+        struct lbuf *wqueue;    /* 4: log pageout queue */
+        int count;              /* 4: count */
+        char uuid[16];          /* 16: 128-bit uuid of log device */
+        int no_integrity;       /* 4: flag to disable journaling to disk */
+};
+/*
+ * Log flag
+ *
+ * NOTE(review): these are consecutive small integers rather than masks, so
+ * they are presumably bit numbers for jfs_log.flag - confirm against users.
+ */
+#define log_INLINELOG   1
+#define log_SYNCBARRIER 2
+#define log_QUIESCE     3
+#define log_FLUSH       4
+/*
+ * group commit flag
+ *
+ * Unlike the log flags above, these are bit masks.
+ */
+/* jfs_log */
+#define logGC_PAGEOUT   0x00000001
+/* tblock/lbuf */
+#define tblkGC_QUEUE            0x0001
+#define tblkGC_READY            0x0002
+#define tblkGC_COMMIT           0x0004
+#define tblkGC_COMMITTED        0x0008
+#define tblkGC_EOP              0x0010
+#define tblkGC_FREE             0x0020
+#define tblkGC_LEADER           0x0040
+#define tblkGC_ERROR            0x0080
+#define tblkGC_LAZY             0x0100  // D230860
+#define tblkGC_UNLOCKED         0x0200  // D230860
+/*
+ *              log cache buffer header
+ *
+ * Describes one log page buffer.  As in struct jfs_log, the "N:" prefixes
+ * in the field comments look like historical size annotations.
+ */
+struct lbuf {
+        struct jfs_log *l_log;  /* 4: log associated with buffer */
+        /*
+         * data buffer base area
+         */
+        uint l_flag;            /* 4: pageout control flags */
+        struct lbuf *l_wqnext;  /* 4: write queue link */
+        struct lbuf *l_freelist;        /* 4: freelistlink (also redrive link, see below) */
+        int l_pn;               /* 4: log page number */
+        int l_eor;              /* 4: log record eor */
+        int l_ceor;             /* 4: committed log record eor */
+        s64 l_blkno;            /* 8: log page block number */
+        caddr_t l_ldata;        /* 4: data page */
+        wait_queue_head_t l_ioevent;    /* 4: i/o done event */
+        struct page *l_page;    /* The page itself */
+};
+/*
+ * Reuse l_freelist for redrive list: l_redrive_next is only an alias, so a
+ * buffer cannot use both links at the same time.
+ */
+#define l_redrive_next l_freelist
+/*
+ *      logsynclist block
+ *
+ * common logsyncblk prefix for jbuf_t and tblock
+ *
+ * Structures linked on a log's synclist start with these fields in this
+ * order (struct metapage in jfs_metapage.h repeats the same prefix), so
+ * the layout must not be changed independently.
+ */
+struct logsyncblk {
+        u16 xflag;              /* flags */
+        u16 flag;               /* only meaningful in tblock */
+        lid_t lid;              /* lock id */
+        s32 lsn;                /* log sequence number */
+        struct list_head synclist;      /* log sync list link */
+};
+/*
+ *      logsynclist serialization (per log)
+ *
+ * Thin wrappers around the spinlock jfs_log.synclock; 'log' is a
+ * struct jfs_log pointer.
+ */
+#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock)
+#define LOGSYNC_LOCK(log) spin_lock(&(log)->synclock)
+#define LOGSYNC_UNLOCK(log) spin_unlock(&(log)->synclock)
+/* compute the difference in bytes of lsn from sync point */
+#define logdiff(diff, lsn, log)\
+{\
+        diff = (lsn) - (log)->syncpt;\
+        if (diff < 0)\
+                diff += (log)->logsize;\
+}
+extern int lmLogOpen(struct super_block *sb);
+extern int lmLogClose(struct super_block *sb);
+extern int lmLogShutdown(struct jfs_log * log);
+extern int lmLogInit(struct jfs_log * log);
+extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize);
+extern void jfs_flush_journal(struct jfs_log * log, int wait);
+#endif                          /* _H_JFS_LOGMGR */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
new file mode 100644
index 000000000000..4c0a3ac75c08
--- /dev/null
+++ b/fs/jfs/jfs_metapage.c
@@ -0,0 +1,580 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2003
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/buffer_head.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+/* Protects the metapage hash chains and the count/lock state of metapages */
+static DEFINE_SPINLOCK(meta_lock);
+#ifdef CONFIG_JFS_STATISTICS
+/* Counters reported by jfs_mpstat_read() */
+static struct {
+        uint    pagealloc;      /* # of page allocations */
+        uint    pagefree;       /* # of page frees */
+        uint    lockwait;       /* # of sleeping lock_metapage() calls */
+} mpStat;
+#endif
+/*
+ * 1024 buckets of one pointer each: a single 4K page when pointers are
+ * 4 bytes wide.  metapage_init() sizes the allocation from sizeof(void *).
+ */
+#define HASH_BITS 10            /* This makes hash_table 1 4K page */
+#define HASH_SIZE (1 << HASH_BITS)
+/* Array of HASH_SIZE chain heads, allocated in metapage_init() */
+static struct metapage **hash_table = NULL;
+/* Page-allocation order used for hash_table */
+static unsigned long hash_order;
+/*
+ * Report whether META_locked is currently set in mp->flag.
+ */
+static inline int metapage_locked(struct metapage *mp)
+{
+        int held = test_bit(META_locked, &mp->flag);
+        return held;
+}
+/*
+ * Try to take the metapage lock without sleeping.
+ *
+ * Returns the previous state of META_locked: 0 means the caller now owns
+ * the lock, non-zero means it was already held by someone else.
+ */
+static inline int trylock_metapage(struct metapage *mp)
+{
+        int already_locked = test_and_set_bit(META_locked, &mp->flag);
+        return already_locked;
+}
+/*
+ * Drop the metapage lock and wake a task sleeping in __lock_metapage().
+ */
+static inline void unlock_metapage(struct metapage *mp)
+{
+        wait_queue_head_t *sleepers = &mp->wait;
+        /* The bit must be clear before the waiter is woken to retry */
+        clear_bit(META_locked, &mp->flag);
+        wake_up(sleepers);
+}
+/*
+ * Slow path of lock_metapage(): sleep until META_locked can be taken.
+ *
+ * Entered with meta_lock held.  The spinlock is dropped around schedule()
+ * and re-taken before the lock bit is tried again, so it is held on return.
+ */
+static void __lock_metapage(struct metapage *mp)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        INCREMENT(mpStat.lockwait);
+        /* Queue up before testing the bit so the wake_up in unlock_metapage() finds us */
+        add_wait_queue_exclusive(&mp->wait, &wait);
+        do {
+                set_current_state(TASK_UNINTERRUPTIBLE);
+                if (metapage_locked(mp)) {
+                        /* Cannot sleep with the spinlock held */
+                        spin_unlock(&meta_lock);
+                        schedule();
+                        spin_lock(&meta_lock);
+                }
+                /* trylock returns non-zero while somebody else holds the lock */
+        } while (trylock_metapage(mp));
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(&mp->wait, &wait);
+}
+/*
+ * Take the metapage lock, sleeping in __lock_metapage() when contended.
+ * needs meta_lock
+ */
+static inline void lock_metapage(struct metapage *mp)
+{
+        int contended = trylock_metapage(mp);
+        if (!contended)
+                return;
+        __lock_metapage(mp);
+}
+/* Number of metapage structures the mempool keeps in reserve */
+#define METAPOOL_MIN_PAGES 32
+/* Slab cache backing metapage_mempool; both are set up in metapage_init() */
+static kmem_cache_t *metapage_cache;
+static mempool_t *metapage_mempool;
+/*
+ * Slab constructor for struct metapage objects.
+ *
+ * Acts only on a plain construction call (SLAB_CTOR_CONSTRUCTOR set and
+ * SLAB_CTOR_VERIFY clear); the object is then put into the "free" state.
+ */
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+        struct metapage *mp = foo;
+        unsigned long ctor_mode =
+            flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR);
+        if (ctor_mode != SLAB_CTOR_CONSTRUCTOR)
+                return;
+        mp->lid = 0;
+        mp->lsn = 0;
+        mp->clsn = 0;
+        mp->log = NULL;
+        mp->data = NULL;
+        /* flag is cleared first, then only META_free is raised */
+        mp->flag = 0;
+        set_bit(META_free, &mp->flag);
+        init_waitqueue_head(&mp->wait);
+}
+/*
+ * Get a metapage structure from metapage_mempool using the given
+ * allocation flags.  The result is whatever mempool_alloc() returns.
+ */
+static inline struct metapage *alloc_metapage(int gfp_mask)
+{
+        struct metapage *fresh = mempool_alloc(metapage_mempool, gfp_mask);
+        return fresh;
+}
+/*
+ * Give a metapage structure back to metapage_mempool.
+ * All flag bits are dropped and META_free is raised before the hand-back.
+ */
+static inline void free_metapage(struct metapage *mp)
+{
+        unsigned long *flagp = &mp->flag;
+        *flagp = 0;
+        set_bit(META_free, flagp);
+        mempool_free(mp, metapage_mempool);
+}
+/*
+ * metapage_init - set up the metapage slab cache, mempool and hash table
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails.  On failure
+ * everything allocated so far is released again, so the caller has nothing
+ * to undo.
+ */
+int __init metapage_init(void)
+{
+        /*
+         * Allocate the metapage structures
+         */
+        metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage),
+                                           0, 0, init_once, NULL);
+        if (metapage_cache == NULL)
+                return -ENOMEM;
+        metapage_mempool = mempool_create(METAPOOL_MIN_PAGES, mempool_alloc_slab,
+                                          mempool_free_slab, metapage_cache);
+        if (metapage_mempool == NULL)
+                goto out_cache;
+        /*
+         * Now the hash list: find the smallest page order that holds
+         * HASH_SIZE pointers.
+         */
+        for (hash_order = 0;
+             ((PAGE_SIZE << hash_order) / sizeof(void *)) < HASH_SIZE;
+             hash_order++)
+                ;
+        hash_table =
+            (struct metapage **) __get_free_pages(GFP_KERNEL, hash_order);
+        /* Fail the initialisation cleanly instead of asserting on OOM */
+        if (hash_table == NULL)
+                goto out_mempool;
+        memset(hash_table, 0, PAGE_SIZE << hash_order);
+        return 0;
+out_mempool:
+        mempool_destroy(metapage_mempool);
+out_cache:
+        kmem_cache_destroy(metapage_cache);
+        return -ENOMEM;
+}
+/*
+ * metapage_exit - release everything metapage_init() allocated
+ *
+ * Frees the hash table pages as well as the mempool and slab cache, so
+ * nothing is leaked when the filesystem code is torn down.
+ */
+void metapage_exit(void)
+{
+        /* free_pages() ignores a zero address, so a NULL table is harmless */
+        free_pages((unsigned long) hash_table, hash_order);
+        hash_table = NULL;
+        mempool_destroy(metapage_mempool);
+        kmem_cache_destroy(metapage_cache);
+}
+/*
+ * Basically same hash as in pagemap.h, but using our hash table
+ *
+ * Returns the address of the chain head for (mapping, index).
+ */
+static struct metapage **meta_hash(struct address_space *mapping,
+                                   unsigned long index)
+{
+        /* Largest power of two that divides sizeof(struct inode) */
+        unsigned long key = ((unsigned long)mapping) /
+            (sizeof(struct inode) & ~(sizeof(struct inode) - 1));
+        key += index;
+        /* Fold the higher bits down before masking to the table size */
+        key += key >> HASH_BITS;
+        return hash_table + (key & (HASH_SIZE - 1));
+}
+/*
+ * Walk one hash chain looking for the metapage that matches both mapping
+ * and index.  Returns it, or NULL when the chain has no such entry.
+ */
+static struct metapage *search_hash(struct metapage ** hash_ptr,
+                                    struct address_space *mapping,
+                               unsigned long index)
+{
+        struct metapage *cur = *hash_ptr;
+        while (cur) {
+                if (cur->mapping == mapping && cur->index == index)
+                        break;
+                cur = cur->hash_next;
+        }
+        return cur;
+}
+/*
+ * Insert mp at the front of the doubly linked chain headed by *hash_ptr.
+ */
+static void add_to_hash(struct metapage * mp, struct metapage ** hash_ptr)
+{
+        struct metapage *old_head = *hash_ptr;
+        mp->hash_next = old_head;
+        mp->hash_prev = NULL;
+        if (old_head)
+                old_head->hash_prev = mp;
+        *hash_ptr = mp;
+}
+/*
+ * Unlink mp from the chain headed by *hash_ptr.  When mp has no
+ * predecessor it must be the chain head, which is asserted.
+ */
+static void remove_from_hash(struct metapage * mp, struct metapage ** hash_ptr)
+{
+        struct metapage *before = mp->hash_prev;
+        struct metapage *after = mp->hash_next;
+        if (!before) {
+                assert(*hash_ptr == mp);
+                *hash_ptr = after;
+        } else
+                before->hash_next = after;
+        if (after)
+                after->hash_prev = before;
+}
+/*
+ * __get_metapage - look up, or create, the metapage for a logical block
+ *
+ * inode:    inode the metadata belongs to
+ * lblock:   logical block number; it is the hash key and becomes mp->index
+ * size:     logical size of the metadata in bytes
+ * absolute: non-zero to use the block device's mapping instead of
+ *           inode->i_mapping
+ * new:      non-zero when the caller will initialise the contents: the page
+ *           is grabbed instead of read and the data area is zeroed
+ *
+ * Returns the metapage with META_locked held and its count raised, or NULL
+ * on failure.  Callers give it back with release_metapage() or one of its
+ * wrappers.
+ */
+struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
+                                unsigned int size, int absolute,
+                                unsigned long new)
+{
+        struct metapage **hash_ptr;
+        int l2BlocksPerPage;
+        int l2bsize;
+        struct address_space *mapping;
+        struct metapage *mp;
+        unsigned long page_index;
+        unsigned long page_offset;
+        jfs_info("__get_metapage: inode = 0x%p, lblock = 0x%lx", inode, lblock);
+        if (absolute)
+                mapping = inode->i_sb->s_bdev->bd_inode->i_mapping;
+        else {
+                /*
+                 * If an nfs client tries to read an inode that is larger
+                 * than any existing inodes, we may try to read past the
+                 * end of the inode map
+                 */
+                if ((lblock << inode->i_blkbits) >= inode->i_size)
+                        return NULL;
+                mapping = inode->i_mapping;
+        }
+        hash_ptr = meta_hash(mapping, lblock);
+again:
+        spin_lock(&meta_lock);
+        mp = search_hash(hash_ptr, mapping, lblock);
+        if (mp) {
+              page_found:
+                /*
+                 * release_metapage() is tearing this one down; back off
+                 * and look again once it has left the hash.
+                 */
+                if (test_bit(META_stale, &mp->flag)) {
+                        spin_unlock(&meta_lock);
+                        msleep(1);
+                        goto again;
+                }
+                mp->count++;
+                lock_metapage(mp);
+                spin_unlock(&meta_lock);
+                if (test_bit(META_discard, &mp->flag)) {
+                        if (!new) {
+                                jfs_error(inode->i_sb,
+                                          "__get_metapage: using a "
+                                          "discarded metapage");
+                                release_metapage(mp);
+                                return NULL;
+                        }
+                        /* The block is being reused: revive the metapage */
+                        clear_bit(META_discard, &mp->flag);
+                }
+                jfs_info("__get_metapage: found 0x%p, in hash", mp);
+                if (mp->logical_size != size) {
+                        jfs_error(inode->i_sb,
+                                  "__get_metapage: mp->logical_size != size");
+                        release_metapage(mp);
+                        return NULL;
+                }
+        } else {
+                /* Translate the block number into page index + byte offset */
+                l2bsize = inode->i_blkbits;
+                l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+                page_index = lblock >> l2BlocksPerPage;
+                page_offset = (lblock - (page_index << l2BlocksPerPage)) <<
+                    l2bsize;
+                if ((page_offset + size) > PAGE_CACHE_SIZE) {
+                        spin_unlock(&meta_lock);
+                        jfs_err("MetaData crosses page boundary!!");
+                        return NULL;
+                }
+                
+                /*
+                 * Locks held on aggregate inode pages are usually
+                 * not held long, and they are taken in critical code
+                 * paths (committing dirty inodes, txCommit thread) 
+                 * 
+                 * Attempt to get metapage without blocking, tapping into
+                 * reserves if necessary.
+                 */
+                mp = NULL;
+                if (JFS_IP(inode)->fileset == AGGREGATE_I) {
+                        mp = alloc_metapage(GFP_ATOMIC);
+                        if (!mp) {
+                                /*
+                                 * mempool is supposed to protect us from
+                                 * failing here.  We will try a blocking
+                                 * call, but a deadlock is possible here
+                                 */
+                                printk(KERN_WARNING
+                                       "__get_metapage: atomic call to mempool_alloc failed.\n");
+                                printk(KERN_WARNING
+                                       "Will attempt blocking call\n");
+                        }
+                }
+                if (!mp) {
+                        struct metapage *mp2;
+                        spin_unlock(&meta_lock);
+                        mp = alloc_metapage(GFP_NOFS);
+                        spin_lock(&meta_lock);
+                        /* we dropped the meta_lock, we need to search the
+                         * hash again.
+                         */
+                        mp2 = search_hash(hash_ptr, mapping, lblock);
+                        if (mp2) {
+                                free_metapage(mp);
+                                mp = mp2;
+                                goto page_found;
+                        }
+                }
+                /* Publish the new metapage, already locked, in the hash */
+                mp->flag = 0;
+                lock_metapage(mp);
+                if (absolute)
+                        set_bit(META_absolute, &mp->flag);
+                mp->xflag = COMMIT_PAGE;
+                mp->count = 1;
+                atomic_set(&mp->nohomeok,0);
+                mp->mapping = mapping;
+                mp->index = lblock;
+                mp->page = NULL;
+                mp->logical_size = size;
+                add_to_hash(mp, hash_ptr);
+                spin_unlock(&meta_lock);
+                if (new) {
+                        jfs_info("__get_metapage: Calling grab_cache_page");
+                        mp->page = grab_cache_page(mapping, page_index);
+                        if (!mp->page) {
+                                jfs_err("grab_cache_page failed!");
+                                goto freeit;
+                        } else {
+                                INCREMENT(mpStat.pagealloc);
+                                unlock_page(mp->page);
+                        }
+                } else {
+                        jfs_info("__get_metapage: Calling read_cache_page");
+                        /*
+                         * read_cache_page() takes a page index, not a block
+                         * number.  The two only coincide when the block
+                         * size equals the page size, so use page_index just
+                         * like the grab_cache_page() path above.
+                         */
+                        mp->page = read_cache_page(mapping, page_index,
+                                    (filler_t *)mapping->a_ops->readpage, NULL);
+                        if (IS_ERR(mp->page)) {
+                                jfs_err("read_cache_page failed!");
+                                goto freeit;
+                        } else
+                                INCREMENT(mpStat.pagealloc);
+                }
+                mp->data = kmap(mp->page) + page_offset;
+        }
+        if (new)
+                memset(mp->data, 0, PSIZE);
+        jfs_info("__get_metapage: returning = 0x%p", mp);
+        return mp;
+freeit:
+        /* Could not get the page: withdraw the metapage that was just added */
+        spin_lock(&meta_lock);
+        remove_from_hash(mp, hash_ptr);
+        free_metapage(mp);
+        spin_unlock(&meta_lock);
+        return NULL;
+}
+/*
+ * hold_metapage - take an additional reference on a metapage
+ *
+ * With force == 0 the metapage lock is acquired, sleeping if needed.
+ * With force != 0 the lock is only tried; if it was already held,
+ * META_forced is set so that release_metapage() drops just the count.
+ */
+void hold_metapage(struct metapage * mp, int force)
+{
+        spin_lock(&meta_lock);
+        mp->count++;
+        if (!force)
+                lock_metapage(mp);
+        else {
+                ASSERT (!(test_bit(META_forced, &mp->flag)));
+                /* non-zero: somebody else owns the lock */
+                if (trylock_metapage(mp))
+                        set_bit(META_forced, &mp->flag);
+        }
+        spin_unlock(&meta_lock);
+}
+/*
+ * Push the metapage's byte range through the mapping's prepare_write /
+ * commit_write operations and clear META_dirty.  Errors are logged only;
+ * a prepare_write failure also clears the page's uptodate state.
+ */
+static void __write_metapage(struct metapage * mp)
+{
+        int l2bsize = mp->mapping->host->i_blkbits;
+        int l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+        unsigned long first_block;
+        unsigned long from;
+        int rc;
+        jfs_info("__write_metapage: mp = 0x%p", mp);
+        /* Byte offset of this metapage inside its page */
+        first_block = mp->page->index << l2BlocksPerPage;
+        from = (mp->index - first_block) << l2bsize;
+        lock_page(mp->page);
+        rc = mp->mapping->a_ops->prepare_write(NULL, mp->page, from,
+                                               from + mp->logical_size);
+        if (!rc) {
+                rc = mp->mapping->a_ops->commit_write(NULL, mp->page, from,
+                                                      from +
+                                                      mp->logical_size);
+                if (rc)
+                        jfs_err("commit_write returned %d", rc);
+                unlock_page(mp->page);
+                clear_bit(META_dirty, &mp->flag);
+                jfs_info("__write_metapage done");
+                return;
+        }
+        jfs_err("prepare_write return %d!", rc);
+        ClearPageUptodate(mp->page);
+        unlock_page(mp->page);
+        clear_bit(META_dirty, &mp->flag);
+}
+/*
+ * Write the metapage's page out with write_one_page() when it has buffers
+ * attached.  An extra page reference is held for the duration.
+ */
+static inline void sync_metapage(struct metapage *mp)
+{
+        struct page *pg = mp->page;
+        page_cache_get(pg);
+        lock_page(pg);
+        /* we're done with this page - no need to check for errors */
+        if (!page_has_buffers(pg))
+                unlock_page(pg);
+        else
+                write_one_page(pg, 1);
+        page_cache_release(pg);
+}
+/*
+ * release_metapage - drop one reference to a metapage
+ *
+ * If META_forced is set it is cleared and only the count is dropped.
+ * Otherwise the count is dropped and, while other references or nohomeok
+ * holds remain, the metapage is just unlocked.  When the last reference
+ * goes away the page is written / synced / invalidated according to
+ * META_dirty, META_sync and META_discard, the metapage is taken off the
+ * logsynclist and its hash chain, and it goes back to the mempool.
+ */
+void release_metapage(struct metapage * mp)
+{
+        struct jfs_log *log;
+        jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag);
+        spin_lock(&meta_lock);
+        if (test_bit(META_forced, &mp->flag)) {
+                /* Set by hold_metapage(mp, 1) when it could not take the lock */
+                clear_bit(META_forced, &mp->flag);
+                mp->count--;
+                spin_unlock(&meta_lock);
+                return;
+        }
+        assert(mp->count);
+        if (--mp->count || atomic_read(&mp->nohomeok)) {
+                /* Still in use: hand the lock to the next waiter, keep the page */
+                unlock_metapage(mp);
+                spin_unlock(&meta_lock);
+                return;
+        }
+        if (mp->page) {
+                /*
+                 * META_stale makes __get_metapage() and
+                 * __invalidate_metapages() back off and retry while
+                 * meta_lock is dropped for the page work below.
+                 */
+                set_bit(META_stale, &mp->flag);
+                spin_unlock(&meta_lock);
+                kunmap(mp->page);
+                mp->data = NULL;
+                if (test_bit(META_dirty, &mp->flag))
+                        __write_metapage(mp);
+                if (test_bit(META_sync, &mp->flag)) {
+                        sync_metapage(mp);
+                        clear_bit(META_sync, &mp->flag);
+                }
+                if (test_bit(META_discard, &mp->flag)) {
+                        lock_page(mp->page);
+                        block_invalidatepage(mp->page, 0);
+                        unlock_page(mp->page);
+                }
+                /* Drop the page reference held by this metapage */
+                page_cache_release(mp->page);
+                mp->page = NULL;
+                INCREMENT(mpStat.pagefree);
+                spin_lock(&meta_lock);
+        }
+        if (mp->lsn) {
+                /*
+                 * Remove metapage from logsynclist.
+                 */
+                log = mp->log;
+                LOGSYNC_LOCK(log);
+                mp->log = NULL;
+                mp->lsn = 0;
+                mp->clsn = 0;
+                log->count--;
+                list_del(&mp->synclist);
+                LOGSYNC_UNLOCK(log);
+        }
+        remove_from_hash(mp, meta_hash(mp->mapping, mp->index));
+        spin_unlock(&meta_lock);
+        free_metapage(mp);
+}
+/*
+ * __invalidate_metapages - discard cached metadata for a block range
+ *
+ * ip:   inode used to reach the superblock's block device mapping
+ * addr: first block of the range
+ * len:  number of blocks in the range
+ *
+ * The range is walked one page's worth of blocks at a time.  A block that
+ * has a metapage in the hash gets it marked META_discard (and no longer
+ * dirty); otherwise any page found in the page cache is invalidated
+ * directly.
+ */
+void __invalidate_metapages(struct inode *ip, s64 addr, int len)
+{
+        struct metapage **hash_ptr;
+        unsigned long lblock;
+        int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
+        /* All callers are interested in block device's mapping */
+        struct address_space *mapping = ip->i_sb->s_bdev->bd_inode->i_mapping;
+        struct metapage *mp;
+        struct page *page;
+        /*
+         * First, mark metapages to discard.  They will eventually be
+         * released, but should not be written.
+         */
+        for (lblock = addr; lblock < addr + len;
+             lblock += 1 << l2BlocksPerPage) {
+                hash_ptr = meta_hash(mapping, lblock);
+again:
+                spin_lock(&meta_lock);
+                mp = search_hash(hash_ptr, mapping, lblock);
+                if (mp) {
+                        if (test_bit(META_stale, &mp->flag)) {
+                                /* Being torn down by release_metapage(); retry shortly */
+                                spin_unlock(&meta_lock);
+                                msleep(1);
+                                goto again;
+                        }
+                        clear_bit(META_dirty, &mp->flag);
+                        set_bit(META_discard, &mp->flag);
+                        spin_unlock(&meta_lock);
+                } else {
+                        spin_unlock(&meta_lock);
+                        /* No metapage: invalidate the page cache page itself, if present */
+                        page = find_lock_page(mapping, lblock>>l2BlocksPerPage);
+                        if (page) {
+                                block_invalidatepage(page, 0);
+                                unlock_page(page);
+                                page_cache_release(page);
+                        }
+                }
+        }
+}
+#ifdef CONFIG_JFS_STATISTICS
+/*
+ * jfs_mpstat_read - procfs read handler reporting the metapage counters
+ *
+ * buffer: output buffer the whole report is formatted into
+ * start:  set to the position in buffer that corresponds to 'offset'
+ * offset: offset into the report the reader has reached
+ * length: maximum number of bytes the reader wants
+ * eof:    set to 1 when the remainder of the report fits in 'length'
+ * data:   unused
+ *
+ * Returns the number of bytes available at *start (never negative).
+ * The counters are unsigned, so they are printed with %u.
+ */
+int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
+                    int *eof, void *data)
+{
+        int len = 0;
+        off_t begin;
+        len += sprintf(buffer,
+                       "JFS Metapage statistics\n"
+                       "=======================\n"
+                       "page allocations = %u\n"
+                       "page frees = %u\n"
+                       "lock waits = %u\n",
+                       mpStat.pagealloc,
+                       mpStat.pagefree,
+                       mpStat.lockwait);
+        begin = offset;
+        *start = buffer + begin;
+        len -= begin;
+        if (len > length)
+                len = length;
+        else
+                *eof = 1;
+        /* Reader is already past the end of the report */
+        if (len < 0)
+                len = 0;
+        return len;
+}
+#endif
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
new file mode 100644
index 000000000000..0e58aba58c37
--- /dev/null
+++ b/fs/jfs/jfs_metapage.h
@@ -0,0 +1,115 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_METAPAGE
+#define _H_JFS_METAPAGE
+#include <linux/pagemap.h>
+/*
+ * In-memory descriptor for one piece of cached metadata.
+ *
+ * The first fields must keep the same order and types as struct logsyncblk
+ * (see jfs_logmgr.h).
+ */
+struct metapage {
+        /* Common logsyncblk prefix (see jfs_logmgr.h) */
+        u16 xflag;
+        u16 unused;
+        lid_t lid;
+        int lsn;
+        struct list_head synclist;
+        /* End of logsyncblk prefix */
+        unsigned long flag;     /* See Below (META_* bit numbers) */
+        unsigned long count;    /* Reference count */
+        void *data;             /* Data pointer */
+        /* list management stuff */
+        struct metapage *hash_prev;
+        struct metapage *hash_next;     /* Also used for free list */
+        /*
+         * mapping & index become redundant, but we need these here to
+         * add the metapage to the hash before we have the real page
+         */
+        struct address_space *mapping;
+        unsigned long index;
+        wait_queue_head_t wait; /* Tasks sleeping for META_locked */
+        /* implementation */
+        struct page *page;      /* Backing page cache page, NULL when none is attached */
+        unsigned long logical_size;     /* Size passed to __get_metapage() */
+        /* Journal management */
+        int clsn;
+        atomic_t nohomeok;      /* While non-zero, release_metapage() keeps the metapage */
+        struct jfs_log *log;
+};
+/*
+ * metapage flag
+ *
+ * These are bit numbers (not masks) for metapage.flag, to be used with
+ * set_bit() / clear_bit() / test_bit().
+ */
+#define META_locked     0       /* metapage lock is held */
+#define META_absolute   1       /* block device mapping is used */
+#define META_free       2       /* structure is not in use */
+#define META_dirty      3       /* must be written when released */
+#define META_sync       4       /* write synchronously when released */
+#define META_discard    5       /* contents are to be thrown away, not written */
+#define META_forced     6       /* held via hold_metapage(mp, 1) without the lock */
+#define META_stale      7       /* being torn down by release_metapage() */
+/* Mark a metapage dirty without releasing it */
+#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag)
+/* function prototypes */
+extern struct metapage *__get_metapage(struct inode *inode,
+                                  unsigned long lblock, unsigned int size,
+                                  int absolute, unsigned long new);
+/* Fetch existing metadata: __get_metapage() with new == FALSE */
+#define read_metapage(inode, lblock, size, absolute)\
+         __get_metapage(inode, lblock, size, absolute, FALSE)
+/* Fetch metadata the caller is about to initialise: new == TRUE */
+#define get_metapage(inode, lblock, size, absolute)\
+         __get_metapage(inode, lblock, size, absolute, TRUE)
+/* Drop a reference obtained from __get_metapage() or hold_metapage() */
+extern void release_metapage(struct metapage *);
+/* Take an extra reference; the int is the 'force' flag */
+extern void hold_metapage(struct metapage *, int);
+/*
+ * Mark the metapage dirty and release the caller's reference.
+ */
+static inline void write_metapage(struct metapage *mp)
+{
+        mark_metapage_dirty(mp);
+        release_metapage(mp);
+}
+/*
+ * Like write_metapage(), but META_sync is raised first as well.
+ */
+static inline void flush_metapage(struct metapage *mp)
+{
+        set_bit(META_sync, &mp->flag);
+        /* Remainder is write_metapage() spelled out */
+        set_bit(META_dirty, &mp->flag);
+        release_metapage(mp);
+}
+/*
+ * Release the caller's reference after dropping META_dirty and raising
+ * META_discard on the metapage.
+ */
+static inline void discard_metapage(struct metapage *mp)
+{
+        unsigned long *flagp = &mp->flag;
+        clear_bit(META_dirty, flagp);
+        set_bit(META_discard, flagp);
+        release_metapage(mp);
+}
+/*
+ * These routines invalidate all pages for an extent.
+ *
+ * Each macro extracts the start address and length from its extent
+ * descriptor (pxd, dxd or xad, passed by value) and hands them to
+ * __invalidate_metapages().
+ */
+extern void __invalidate_metapages(struct inode *, s64, int);
+#define invalidate_pxd_metapages(ip, pxd) \
+        __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd)))
+#define invalidate_dxd_metapages(ip, dxd) \
+        __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd)))
+#define invalidate_xad_metapages(ip, xad) \
+        __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad)))
+#endif                          /* _H_JFS_METAPAGE */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
new file mode 100644
index 000000000000..c535ffd638e8
--- /dev/null
+++ b/fs/jfs/jfs_mount.c
@@ -0,0 +1,512 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * Module: jfs_mount.c
+ *
+ * note: file system in transition to aggregate/fileset:
+ *
+ * file system mount is interpreted as the mount of aggregate, 
+ * if not already mounted, and mount of the single/only fileset in 
+ * the aggregate;
+ *
+ * a file system/aggregate is represented by an internal inode
+ * (aka mount inode) initialized with aggregate superblock;
+ * each vfs represents a fileset, and points to its "fileset inode 
+ * allocation map inode" (aka fileset inode):
+ * (an aggregate itself is structured recursively as a fileset:
+ * an internal vfs is constructed and points to its "fileset inode 
+ * allocation map inode" (aka aggregate inode) where each inode 
+ * represents a fileset inode) so that inode number is mapped to 
+ * on-disk inode in uniform way at both aggregate and fileset level;
+ *
+ * each vnode/inode of a fileset is linked to its vfs (to facilitate
+ * per fileset inode operations, e.g., unmount of a fileset, etc.);
+ * each inode points to the mount inode (to facilitate access to
+ * per aggregate information, e.g., block size, etc.) as well as
+ * its file set inode.
+ *
+ *   aggregate 
+ *   ipmnt
+ *   mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap;
+ *             fileset vfs     -> vp(1) <-> ... <-> vp(n) <->vproot;
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+/*
+ * forward references
+ *
+ * Both helpers are private to this file and are defined further down,
+ * after their first use in jfs_mount()/jfs_mount_rw().
+ */
+static int chkSuper(struct super_block *);
+static int logMOUNT(struct super_block *sb);
+/*
+ * NAME:        jfs_mount(sb)
+ *
+ * FUNCTION:    vfs_mount()
+ *              Validates the superblock, then opens and initializes, in
+ *              this order: the aggregate inode allocation map, the
+ *              aggregate block allocation map, the secondary aggregate
+ *              inode allocation map (skipped when JFS_BAD_SAIT is set in
+ *              the mount flags) and the fileset inode allocation map.
+ *              Each inode is recorded in the jfs_sb_info as it is opened.
+ *
+ * PARAMETER:   sb      - super block
+ *
+ * RETURN:      0       - success
+ *              -EIO    - one of the special inodes could not be read
+ *              other   - error code passed through from chkSuper(),
+ *                        diMount() or dbMount()
+ *
+ * NOTE:        on failure every inode acquired so far is unmounted (if it
+ *              had been mounted) and freed, in reverse order of
+ *              acquisition.  The pointers already stored in the
+ *              jfs_sb_info are not cleared.
+ */
+int jfs_mount(struct super_block *sb)
+{
+        int rc = 0;             /* Return code          */
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct inode *ipaimap = NULL;
+        struct inode *ipaimap2 = NULL;
+        struct inode *ipimap = NULL;
+        struct inode *ipbmap = NULL;
+        /*
+         * read/validate superblock
+         * (initialize mount inode from the superblock)
+         */
+        if ((rc = chkSuper(sb))) {
+                goto out;
+        }
+        ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
+        if (ipaimap == NULL) {
+                jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+                rc = -EIO;
+                goto out;
+        }
+        sbi->ipaimap = ipaimap;
+        jfs_info("jfs_mount: ipaimap:0x%p", ipaimap);
+        /*
+         * initialize aggregate inode allocation map
+         */
+        if ((rc = diMount(ipaimap))) {
+                jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
+                goto err_ipaimap;
+        }
+        /*
+         * open aggregate block allocation map
+         */
+        ipbmap = diReadSpecial(sb, BMAP_I, 0);
+        if (ipbmap == NULL) {
+                rc = -EIO;
+                goto err_umount_ipaimap;
+        }
+        jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
+        sbi->ipbmap = ipbmap;
+        /*
+         * initialize aggregate block allocation map
+         */
+        if ((rc = dbMount(ipbmap))) {
+                jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
+                /* ipbmap was read but never mounted: free it only */
+                goto err_ipbmap;
+        }
+        /*
+         * open the secondary aggregate inode allocation map
+         *
+         * This is a duplicate of the aggregate inode allocation map.
+         *
+         * hand craft a vfs in the same fashion as we did to read ipaimap.
+         * By adding INOSPEREXT (32) to the inode number, we are telling
+         * diReadSpecial that we are reading from the secondary aggregate
+         * inode table.  This also creates a unique entry in the inode hash
+         * table.
+         */
+        if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
+                ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
+                if (ipaimap2 == NULL) {
+                        jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+                        rc = -EIO;
+                        goto err_umount_ipbmap;
+                }
+                sbi->ipaimap2 = ipaimap2;
+                jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2);
+                /*
+                 * initialize secondary aggregate inode allocation map
+                 */
+                if ((rc = diMount(ipaimap2))) {
+                        jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
+                                rc);
+                        /* ipaimap2 was read but never mounted: free it only */
+                        goto err_ipaimap2;
+                }
+        } else
+                /* Secondary aggregate inode table is not valid */
+                sbi->ipaimap2 = NULL;
+        /*
+         *      mount (the only/single) fileset
+         */
+        /*
+         * open fileset inode allocation map (aka fileset inode)
+         */
+        ipimap = diReadSpecial(sb, FILESYSTEM_I, 0);
+        if (ipimap == NULL) {
+                jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
+                rc = -EIO;
+                goto err_umount_ipaimap2;
+        }
+        jfs_info("jfs_mount: ipimap:0x%p", ipimap);
+        /* map further access of per fileset inodes by the fileset inode */
+        sbi->ipimap = ipimap;
+        /* initialize fileset inode allocation map */
+        if ((rc = diMount(ipimap))) {
+                jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
+                goto err_ipimap;
+        }
+        goto out;
+        /*
+         *      unwind on error
+         *
+         * Each resource has two labels: err_umount_X undoes a successful
+         * mount of X and falls through to err_X, which frees the inode.
+         */
+      err_ipimap:       /* close fileset inode allocation map inode */
+        diFreeSpecial(ipimap);
+      err_umount_ipaimap2:
+        /* close secondary aggregate inode allocation map */
+        if (ipaimap2)
+                diUnmount(ipaimap2, 1);
+      err_ipaimap2:
+        /* close secondary aggregate inode */
+        if (ipaimap2)
+                diFreeSpecial(ipaimap2);
+      err_umount_ipbmap:
+        /* close aggregate block allocation map */
+        dbUnmount(ipbmap, 1);
+      err_ipbmap:
+        /* close aggregate block allocation map inode */
+        diFreeSpecial(ipbmap);
+      err_umount_ipaimap:
+        /* close aggregate inode allocation map */
+        diUnmount(ipaimap, 1);
+      err_ipaimap:
+        /* close aggregate inodes */
+        diFreeSpecial(ipaimap);
+      out:
+        if (rc)
+                jfs_err("Mount JFS Failure: %d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_mount_rw(sb, remount)
+ *
+ * FUNCTION:    Completes read-write mount, or remounts read-only volume
+ *              as read-write
+ *
+ * PARAMETER:   sb      - super block
+ *              remount - non-zero when a read-only mount is being
+ *                        switched to read-write
+ *
+ * RETURN:      0 on success; -EINVAL when a remount finds the superblock
+ *              invalid or the volume not clean; otherwise the error from
+ *              diMount(), dbMount(), lmLogOpen() or updateSuper().
+ */
+int jfs_mount_rw(struct super_block *sb, int remount)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        int err;
+        if (remount) {
+                /*
+                 * fsck.jfs may have rewritten the inode and block maps
+                 * while the volume was read-only, so throw away what is
+                 * cached and read both maps again.
+                 */
+                if (chkSuper(sb) || (sbi->state != FM_CLEAN))
+                        return -EINVAL;
+                truncate_inode_pages(sbi->ipimap->i_mapping, 0);
+                truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
+                diUnmount(sbi->ipimap, 1);
+                err = diMount(sbi->ipimap);
+                if (err) {
+                        jfs_err("jfs_mount_rw: diMount failed!");
+                        return err;
+                }
+                dbUnmount(sbi->ipbmap, 1);
+                err = dbMount(sbi->ipbmap);
+                if (err) {
+                        jfs_err("jfs_mount_rw: dbMount failed!");
+                        return err;
+                }
+        }
+        /* open/initialize log */
+        err = lmLogOpen(sb);
+        if (err)
+                return err;
+        /* update file system superblock; close the log again on failure */
+        err = updateSuper(sb, FM_MOUNT);
+        if (err) {
+                jfs_err("jfs_mount: updateSuper failed w/rc = %d", err);
+                lmLogClose(sb);
+                return err;
+        }
+        /* write MOUNT log record of the file system */
+        logMOUNT(sb);
+        /* Set page cache allocation policy */
+        mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS);
+        return 0;
+}
+/*
+ *      chkSuper()
+ *
+ * validate the superblock of the file system to be mounted and
+ * get the file system parameters.
+ *
+ * The magic, version, block size (and its log2) and state are checked;
+ * the descriptors of the secondary aggregate inode map/table are sanity
+ * checked and JFS_BAD_SAIT is set in the in-memory copy of s_flag when
+ * they look wrong.  On success the block-size related fields, state,
+ * mount flags, log location, fsck work space and secondary AIT
+ * descriptor are copied into the jfs_sb_info.
+ *
+ * returns
+ *      0 with the jfs_sb_info filled in if check successful
+ *      error code if not successful (-EINVAL for a rejected superblock,
+ *      or the error from readSuper())
+ */
+static int chkSuper(struct super_block *sb)
+{
+        int rc = 0;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_superblock *j_sb;
+        struct buffer_head *bh;
+        int AIM_bytesize, AIT_bytesize;
+        int expected_AIM_bytesize, expected_AIT_bytesize;
+        s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr;
+        s64 byte_addr_diff0, byte_addr_diff1;
+        s32 bsize;
+        u16 l2bsize;
+        if ((rc = readSuper(sb, &bh)))
+                return rc;
+        j_sb = (struct jfs_superblock *)bh->b_data;
+        /*
+         * validate superblock
+         */
+        /* validate fs signature */
+        if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) ||
+            le32_to_cpu(j_sb->s_version) > JFS_VERSION) {
+                rc = -EINVAL;
+                goto out;
+        }
+        bsize = le32_to_cpu(j_sb->s_bsize);
+#ifdef _JFS_4K
+        if (bsize != PSIZE) {
+                jfs_err("Currently only 4K block size supported!");
+                rc = -EINVAL;
+                goto out;
+        }
+#endif                          /* _JFS_4K */
+        /*
+         * s_l2bsize comes straight from disk and is used below as a shift
+         * count and in subtractions against L2PSIZE and L2DISIZE; reject
+         * a value that is out of that range or disagrees with s_bsize
+         * before anything is stored in the jfs_sb_info.
+         */
+        l2bsize = le16_to_cpu(j_sb->s_l2bsize);
+        if (l2bsize < L2DISIZE || l2bsize > L2PSIZE ||
+            bsize != (1 << l2bsize)) {
+                jfs_err("jfs_mount: Mount Failure: superblock is corrupt!");
+                rc = -EINVAL;
+                goto out;
+        }
+        jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx",
+                 le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
+                 (unsigned long long) le64_to_cpu(j_sb->s_size));
+        /* validate the descriptors for Secondary AIM and AIT */
+        if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) !=
+            cpu_to_le32(JFS_BAD_SAIT)) {
+                expected_AIM_bytesize = 2 * PSIZE;
+                AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize;
+                expected_AIT_bytesize = 4 * PSIZE;
+                AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize;
+                AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize;
+                AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize;
+                byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr;
+                fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize;
+                byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr;
+                /*
+                 * expected layout: the map is 2 pages, the table 4 pages,
+                 * the table starts right after the map and the fsck work
+                 * space starts beyond the end of the table.
+                 */
+                if ((AIM_bytesize != expected_AIM_bytesize) ||
+                    (AIT_bytesize != expected_AIT_bytesize) ||
+                    (byte_addr_diff0 != AIM_bytesize) ||
+                    (byte_addr_diff1 <= AIT_bytesize))
+                        j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
+        }
+        /* group commit is always turned on in the in-memory flags */
+        if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) !=
+            cpu_to_le32(JFS_GROUPCOMMIT))
+                j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT);
+        /* validate fs state: a volume that is not clean may only go r/o */
+        if (j_sb->s_state != cpu_to_le32(FM_CLEAN) &&
+            !(sb->s_flags & MS_RDONLY)) {
+                jfs_err("jfs_mount: Mount Failure: File System Dirty.");
+                rc = -EINVAL;
+                goto out;
+        }
+        sbi->state = le32_to_cpu(j_sb->s_state);
+        sbi->mntflag = le32_to_cpu(j_sb->s_flag);
+        /*
+         * JFS always does I/O by 4K pages.  Don't tell the buffer cache
+         * that we use anything else (leave s_blocksize alone).
+         */
+        sbi->bsize = bsize;
+        sbi->l2bsize = l2bsize;
+        /*
+         * For now, ignore s_pbsize, l2bfactor.  All I/O going through buffer
+         * cache.
+         */
+        sbi->nbperpage = PSIZE >> sbi->l2bsize;
+        sbi->l2nbperpage = L2PSIZE - sbi->l2bsize;
+        sbi->l2niperblk = sbi->l2bsize - L2DISIZE;
+        if (sbi->mntflag & JFS_INLINELOG)
+                sbi->logpxd = j_sb->s_logpxd;
+        else {
+                sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev));
+                memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid));
+                memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->loguuid));
+        }
+        sbi->fsckpxd = j_sb->s_fsckpxd;
+        sbi->ait2 = j_sb->s_ait2;
+      out:
+        brelse(bh);
+        return rc;
+}
+/*
+ *      updateSuper()
+ *
+ * update synchronously superblock if it is mounted read-write.
+ *
+ * With JFS_NOINTEGRITY set the requested state is translated first:
+ * FM_DIRTY is only remembered in sbi->p_state (nothing is written),
+ * FM_MOUNT saves the current state in p_state and writes FM_DIRTY, and
+ * FM_CLEAN writes back whatever p_state holds.  Without that flag,
+ * nothing is written once the in-memory state is already FM_DIRTY.
+ *
+ * returns 0, or the error from readSuper().
+ */
+int updateSuper(struct super_block *sb, uint state)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_superblock *j_sb;
+        struct buffer_head *bh;
+        int rc;
+        if (!(sbi->flag & JFS_NOINTEGRITY)) {
+                if (sbi->state == FM_DIRTY)
+                        return 0;
+        } else if (state == FM_DIRTY) {
+                sbi->p_state = state;
+                return 0;
+        } else if (state == FM_MOUNT) {
+                sbi->p_state = sbi->state;
+                state = FM_DIRTY;
+        } else if (state == FM_CLEAN) {
+                state = sbi->p_state;
+        } else {
+                /* logged only; the unexpected value is still written */
+                jfs_err("updateSuper: bad state");
+        }
+        rc = readSuper(sb, &bh);
+        if (rc)
+                return rc;
+        j_sb = (struct jfs_superblock *)bh->b_data;
+        j_sb->s_state = cpu_to_le32(state);
+        sbi->state = state;
+        if (state == FM_MOUNT) {
+                struct jfs_log *log = sbi->log;
+                /* record log's dev_t and mount serial number */
+                j_sb->s_logdev = cpu_to_le32(new_encode_dev(log->bdev->bd_dev));
+                j_sb->s_logserial = cpu_to_le32(log->serial);
+        } else if (state == FM_CLEAN &&
+                   (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED))) {
+                /*
+                 * If this volume is shared with OS/2, OS/2 will need to
+                 * recalculate DASD usage, since we don't deal with it.
+                 */
+                j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME);
+        }
+        /* push the modified buffer out before returning */
+        mark_buffer_dirty(bh);
+        sync_dirty_buffer(bh);
+        brelse(bh);
+        return 0;
+}
+/*
+ *      readSuper()
+ *
+ * read superblock by raw sector address
+ *
+ * The primary superblock is tried first; the secondary/replicated copy is
+ * read only when the primary read fails.  *bpp receives the buffer (NULL
+ * when both reads fail).
+ *
+ * returns 0 on success, -EIO when neither copy could be read.
+ */
+int readSuper(struct super_block *sb, struct buffer_head **bpp)
+{
+        struct buffer_head *bh;
+        /* primary superblock, then fall back to the replica */
+        bh = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits);
+        if (!bh)
+                bh = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits);
+        *bpp = bh;
+        return bh ? 0 : -EIO;
+}
+/*
+ *      logMOUNT()
+ *
+ * function: write a MOUNT log record for file system.
+ *
+ * MOUNT record keeps logredo() from processing log records
+ * for this file system past this point in log.
+ * it is harmless if mount fails.
+ *
+ * note: MOUNT record is at aggregate level, not at fileset level,
+ * since log records of previous mounts of a fileset
+ * (e.g., AFTER record of extent allocation) have to be processed
+ * to update block allocation map at aggregate level.
+ *
+ * always returns 0.
+ */
+static int logMOUNT(struct super_block *sb)
+{
+        struct lrd mount_rec;
+        /*
+         * NOTE(review): only the five fields below are assigned; if
+         * struct lrd has further members they reach lmLog() holding
+         * whatever was on the stack - confirm whether that matters.
+         */
+        mount_rec.type = cpu_to_le16(LOG_MOUNT);
+        mount_rec.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev));
+        mount_rec.logtid = 0;
+        mount_rec.backchain = 0;
+        mount_rec.length = 0;
+        lmLog(JFS_SBI(sb)->log, NULL, &mount_rec, NULL);
+        return 0;
+}
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
new file mode 100644
index 000000000000..ab0566f70cfa
--- /dev/null
+++ b/fs/jfs/jfs_superblock.h
@@ -0,0 +1,113 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2003
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_SUPERBLOCK
+#define _H_JFS_SUPERBLOCK
+/*
+ * make the magic number something a human could read
+ *
+ * JFS_MAGIC is matched against the 4-byte s_magic field, and JFS_VERSION
+ * is the highest s_version value this code supports.
+ */
+#define JFS_MAGIC       "JFS1"  /* Magic word (4 characters, see s_magic) */
+#define JFS_VERSION     2       /* Version number: Version 2 */
+#define LV_NAME_SIZE    11      /* MUST BE 11 for OS/2 boot sector */
+/*
+ *      aggregate superblock
+ *
+ * The name superblock is too close to super_block, so the name has been
+ * changed to jfs_superblock.  The utilities are still using the old name.
+ *
+ * Multi-byte integer fields are declared little-endian (__le16/32/64)
+ * and must be converted with le*_to_cpu()/cpu_to_le*() when accessed.
+ * The leading number in each field comment is the field size in bytes.
+ */
+struct jfs_superblock {
+        char s_magic[4];        /* 4: magic number, see JFS_MAGIC */
+        __le32 s_version;       /* 4: version number, see JFS_VERSION */
+        __le64 s_size;          /* 8: aggregate size in hardware/LVM blocks;
+                                 * VFS: number of blocks
+                                 */
+        __le32 s_bsize;         /* 4: aggregate block size in bytes;
+                                 * VFS: fragment size
+                                 */
+        __le16 s_l2bsize;       /* 2: log2 of s_bsize */
+        __le16 s_l2bfactor;     /* 2: log2(s_bsize/hardware block size) */
+        __le32 s_pbsize;        /* 4: hardware/LVM block size in bytes */
+        __le16 s_l2pbsize;      /* 2: log2 of s_pbsize */
+        __le16 pad;             /* 2: padding necessary for alignment */
+        __le32 s_agsize;        /* 4: allocation group size in aggr. blocks */
+        __le32 s_flag;          /* 4: aggregate attributes:
+                                 *    see jfs_filsys.h
+                                 */
+        __le32 s_state;         /* 4: mount/unmount/recovery state:
+                                 *    see jfs_filsys.h
+                                 */
+        __le32 s_compress;              /* 4: > 0 if data compression */
+        pxd_t s_ait2;           /* 8: first extent of secondary
+                                 *    aggregate inode table
+                                 */
+        pxd_t s_aim2;           /* 8: first extent of secondary
+                                 *    aggregate inode map
+                                 */
+        __le32 s_logdev;                /* 4: device address of log */
+        __le32 s_logserial;     /* 4: log serial number at aggregate mount */
+        pxd_t s_logpxd;         /* 8: inline log extent */
+        pxd_t s_fsckpxd;        /* 8: inline fsck work space extent */
+        struct timestruc_t s_time;      /* 8: time last updated */
+        __le32 s_fsckloglen;    /* 4: Number of filesystem blocks reserved for
+                                 *    the fsck service log.
+                                 *    N.B. These blocks are divided among the
+                                 *         versions kept.  This is not a per
+                                 *         version size.
+                                 *    N.B. These blocks are included in the
+                                 *         length field of s_fsckpxd.
+                                 */
+        s8 s_fscklog;           /* 1: which fsck service log is most recent
+                                 *    0 => no service log data yet
+                                 *    1 => the first one
+                                 *    2 => the 2nd one
+                                 */
+        char s_fpack[11];       /* 11: file system volume name
+                                 *     N.B. This must be 11 bytes to
+                                 *          conform with the OS/2 BootSector
+                                 *          requirements
+                                 *          Only used when s_version is 1
+                                 */
+        /* extendfs() parameter under s_state & FM_EXTENDFS */
+        __le64 s_xsize;         /* 8: extendfs s_size */
+        pxd_t s_xfsckpxd;       /* 8: extendfs fsckpxd */
+        pxd_t s_xlogpxd;        /* 8: extendfs logpxd */
+        /* - 128 byte boundary - */
+        char s_uuid[16];        /* 16: 128-bit uuid for volume */
+        char s_label[16];       /* 16: volume label */
+        char s_loguuid[16];     /* 16: 128-bit uuid for log device */
+};
+/* read the superblock into a buffer_head; returns 0 or a negative errno */
+extern int readSuper(struct super_block *, struct buffer_head **);
+/* write a new state (FM_* value) into the on-disk superblock */
+extern int updateSuper(struct super_block *, uint);
+/* printf-style error report for a mounted file system */
+extern void jfs_error(struct super_block *, const char *, ...);
+#endif /*_H_JFS_SUPERBLOCK */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
new file mode 100644
index 000000000000..f40301d93f74
--- /dev/null
+++ b/fs/jfs/jfs_txnmgr.c
@@ -0,0 +1,3131 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2005
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *      jfs_txnmgr.c: transaction manager
+ *
+ * notes:
+ * transaction starts with txBegin() and ends with txCommit()
+ * or txAbort().
+ *
+ * tlock is acquired at the time of update;
+ * (obviate scan at commit time for xtree and dtree)
+ * tlock and mp point to each other;
+ * (no hashlist for mp -> tlock).
+ *
+ * special cases:
+ * tlock on in-memory inode:
+ * in-place tlock in the in-memory inode itself;
+ * converted to page lock by iWrite() at commit time.
+ *
+ * tlock during write()/mmap() under anonymous transaction (tid = 0):
+ * transferred (?) to transaction at commit time.
+ *
+ * use the page itself to update allocation maps
+ * (obviate intermediate replication of allocation/deallocation data)
+ * hold on to mp+lock thru update of maps
+ */
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/smp_lock.h>
+#include <linux/completion.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dinode.h"
+#include "jfs_imap.h"
+#include "jfs_dmap.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+/*
+ *      transaction management structures
+ *
+ * TxAnchor holds the heads of the free tblock/tlock lists (kept as
+ * table indices, with 0 meaning "none" since index 0 is reserved), the
+ * wait queues slept on when those lists run dry or tlocks run low, and
+ * the lists of transactions/inodes awaiting further processing.
+ */
+static struct {
+        int freetid;            /* index of a free tid structure */
+        int freelock;           /* index first free lock word */
+        wait_queue_head_t freewait;     /* eventlist of free tblock */
+        wait_queue_head_t freelockwait; /* eventlist of free tlock */
+        wait_queue_head_t lowlockwait;  /* eventlist of ample tlocks */
+        int tlocksInUse;        /* Number of tlocks in use */
+        spinlock_t LazyLock;    /* synchronize sync_queue & unlock_queue */
+/*      struct tblock *sync_queue; * Transactions waiting for data sync */
+        struct list_head unlock_queue;  /* Txns waiting to be released */
+        struct list_head anon_list;     /* inodes having anonymous txns */
+        struct list_head anon_list2;    /* inodes having anonymous txns
+                                           that couldn't be sync'ed */
+} TxAnchor;
+/*
+ * Set to 1 by txLockAlloc() when tlocksInUse rises above TxLockHWM and
+ * reset to 0 by txLockFree() once it drops below TxLockLWM.
+ */
+int jfs_tlocks_low;             /* Indicates low number of available tlocks */
+#ifdef CONFIG_JFS_STATISTICS
+/*
+ * Event counters, bumped through INCREMENT(); only present when
+ * statistics are configured in.
+ */
+static struct {
+        uint txBegin;
+        uint txBegin_barrier;
+        uint txBegin_lockslow;
+        uint txBegin_freetid;
+        uint txBeginAnon;
+        uint txBeginAnon_barrier;
+        uint txBeginAnon_lockslow;
+        uint txLockAlloc;       /* calls to txLockAlloc() */
+        uint txLockAlloc_freelock;      /* ... that found the free list empty */
+} TxStat;
+#endif
+/*
+ * Module parameters.  -1 means "not set": txInit() then derives a
+ * default and clamps both values (nTxBlock to 16..65536, nTxLock to
+ * 256..65536).
+ */
+static int nTxBlock = -1;       /* number of transaction blocks */
+module_param(nTxBlock, int, 0);
+MODULE_PARM_DESC(nTxBlock,
+                 "Number of transaction blocks (max:65536)");
+static int nTxLock = -1;        /* number of transaction locks */
+module_param(nTxLock, int, 0);
+MODULE_PARM_DESC(nTxLock,
+                 "Number of transaction locks (max:65536)");
+struct tblock *TxBlock;         /* transaction block table */
+/* thresholds computed in txInit() as 40%, 70% and 80% of nTxLock */
+static int TxLockLWM;           /* Low water mark for number of txLocks used */
+static int TxLockHWM;           /* High water mark for number of txLocks used */
+static int TxLockVHWM;          /* Very High water mark */
+struct tlock *TxLock;           /* transaction lock table */
+/*
+ *      transaction management lock
+ *
+ * jfsTxnLock is taken/released through TXN_LOCK()/TXN_UNLOCK().
+ * TxAnchor.LazyLock is taken with interrupts disabled (irqsave variant)
+ * through LAZY_LOCK()/LAZY_UNLOCK().
+ *
+ * None of these macros ends in a semicolon; the caller supplies it, so
+ * every one of them behaves like a single statement.
+ */
+static DEFINE_SPINLOCK(jfsTxnLock);
+#define TXN_LOCK()              spin_lock(&jfsTxnLock)
+#define TXN_UNLOCK()            spin_unlock(&jfsTxnLock)
+#define LAZY_LOCK_INIT()        spin_lock_init(&TxAnchor.LazyLock)
+#define LAZY_LOCK(flags)        spin_lock_irqsave(&TxAnchor.LazyLock, flags)
+#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
+/* wait queues for the sync and commit threads */
+DECLARE_WAIT_QUEUE_HEAD(jfs_sync_thread_wait);
+DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
+static int jfs_commit_thread_waking;
+/*
+ * Retry logic exists outside these macros to protect from spurious wakeups.
+ *
+ * TXN_SLEEP_DROP_LOCK(event): must be called with the transaction lock
+ * held.  The task is queued on @event and marked uninterruptible before
+ * the lock is dropped, so a wakeup issued under the lock cannot be lost.
+ * Returns with the lock NOT held.
+ *
+ * TXN_SLEEP(event): same, but re-takes the transaction lock before
+ * continuing.  Wrapped in do { } while (0) so that it acts as a single
+ * statement, including as the unbraced body of an if/else.
+ *
+ * TXN_WAKEUP(event): wake every task sleeping on @event.
+ */
+static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        add_wait_queue(event, &wait);
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        TXN_UNLOCK();
+        schedule();
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(event, &wait);
+}
+#define TXN_SLEEP(event)\
+do {\
+        TXN_SLEEP_DROP_LOCK(event);\
+        TXN_LOCK();\
+} while (0)
+#define TXN_WAKEUP(event) wake_up_all(event)
+/*
+ *      statistics
+ *
+ * maxtid and maxlid are both initialized to 1 by txInit(); maxlid is
+ * raised through HIGHWATERMARK() in txLockAlloc().
+ */
+static struct {
+        tid_t maxtid;           /* 4: biggest tid ever used */
+        lid_t maxlid;           /* 4: biggest lid ever used */
+        int ntid;               /* 4: # of transactions performed */
+        int nlid;               /* 4: # of tlocks acquired */
+        int waitlock;           /* 4: # of tlock wait */
+} stattx;
+/*
+ * external references
+ *
+ * NOTE(review): these are declared here rather than taken from a header,
+ * so the compiler cannot check them against the definitions - keep them
+ * in sync by hand.
+ */
+extern int lmGroupCommit(struct jfs_log *, struct tblock *);
+extern int jfs_commit_inode(struct inode *, int);
+extern int jfs_stop_threads;
+extern struct completion jfsIOwait;
+/*
+ * forward references
+ *
+ * All of the following are private to this file.  The *Log() helpers
+ * share the (log, tblk, lrd, tlck) argument pattern; diLog() additionally
+ * takes the commit descriptor.
+ */
+static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+                struct tlock * tlck, struct commit * cd);
+static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+                struct tlock * tlck);
+static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+                struct tlock * tlck);
+static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+                struct tlock * tlck);
+static void txAllocPMap(struct inode *ip, struct maplock * maplock,
+                struct tblock * tblk);
+static void txForce(struct tblock * tblk);
+static int txLog(struct jfs_log * log, struct tblock * tblk,
+                struct commit * cd);
+static void txUpdateMap(struct tblock * tblk);
+static void txRelease(struct tblock * tblk);
+static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+           struct tlock * tlck);
+static void LogSyncRelease(struct metapage * mp);
+/*
+ *              transaction block/lock management
+ *              ---------------------------------
+ */
+/*
+ * txLockAlloc: take a transaction lock off the free list and return its
+ * id, sleeping on TxAnchor.freelockwait for as long as the list is empty.
+ * When the number of tlocks in use climbs past the high water mark,
+ * jfs_tlocks_low is raised and the sync daemon is woken, which should
+ * free some anonymous transaction locks.  (TXN_LOCK must be held.)
+ */
+static lid_t txLockAlloc(void)
+{
+        lid_t lid;
+        INCREMENT(TxStat.txLockAlloc);
+        if (!TxAnchor.freelock) {
+                INCREMENT(TxStat.txLockAlloc_freelock);
+        }
+        /* re-check after every wakeup: another task may have won the lock */
+        for (;;) {
+                lid = TxAnchor.freelock;
+                if (lid)
+                        break;
+                TXN_SLEEP(&TxAnchor.freelockwait);
+        }
+        /* unlink the head of the free list */
+        TxAnchor.freelock = TxLock[lid].next;
+        HIGHWATERMARK(stattx.maxlid, lid);
+        TxAnchor.tlocksInUse++;
+        if (TxAnchor.tlocksInUse > TxLockHWM && !jfs_tlocks_low) {
+                jfs_info("txLockAlloc tlocks low");
+                jfs_tlocks_low = 1;
+                wake_up(&jfs_sync_thread_wait);
+        }
+        return lid;
+}
+/*
+ * txLockFree: push tlock @lid back on the head of the free list, lower
+ * the in-use count and wake anyone waiting for a free tlock.  When the
+ * count falls below the low water mark while jfs_tlocks_low is set, the
+ * flag is cleared and the tasks on TxAnchor.lowlockwait are woken too.
+ */
+static void txLockFree(lid_t lid)
+{
+        struct tlock *freed = &TxLock[lid];
+        freed->next = TxAnchor.freelock;
+        TxAnchor.freelock = lid;
+        if (--TxAnchor.tlocksInUse < TxLockLWM && jfs_tlocks_low) {
+                jfs_info("txLockFree jfs_tlocks_low no more");
+                jfs_tlocks_low = 0;
+                TXN_WAKEUP(&TxAnchor.lowlockwait);
+        }
+        TXN_WAKEUP(&TxAnchor.freelockwait);
+}
+/*
+ * NAME:        txInit()
+ *
+ * FUNCTION:    initialize transaction management structures: settle the
+ *              nTxBlock/nTxLock tunables, allocate the tblock and tlock
+ *              tables, chain their entries into free lists and set up
+ *              the wait queues and lists in TxAnchor.
+ *
+ * RETURN:      0 on success, -ENOMEM if either table cannot be allocated
+ *
+ * serialization: single thread at jfs_init()
+ */
+int txInit(void)
+{
+        int k, size;
+        struct sysinfo si;
+        /* Set defaults for nTxLock and nTxBlock if unset */
+        if (nTxLock == -1) {
+                if (nTxBlock != -1) {
+                        /* Only nTxBlock was given: eight locks per block */
+                        if (nTxBlock > (8 * 1024))
+                                nTxLock = 64 * 1024;
+                        else
+                                nTxLock = nTxBlock << 3;
+                } else {
+                        /* Base default on memory size */
+                        si_meminfo(&si);
+                        if (si.totalram > (256 * 1024)) /* 1 GB */
+                                nTxLock = 64 * 1024;
+                        else
+                                nTxLock = si.totalram >> 2;
+                }
+        }
+        if (nTxBlock == -1)
+                nTxBlock = nTxLock >> 3;
+        /* Verify tunable parameters */
+        if (nTxBlock < 16)
+                nTxBlock = 16;  /* No one should set it this low */
+        else if (nTxBlock > 65536)
+                nTxBlock = 65536;
+        if (nTxLock < 256)
+                nTxLock = 256;  /* No one should set it this low */
+        else if (nTxLock > 65536)
+                nTxLock = 65536;
+        printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
+               nTxBlock, nTxLock);
+        /* water marks at 40%, 70% and 80% of the tlock table */
+        TxLockLWM = (nTxLock * 4) / 10;
+        TxLockHWM = (nTxLock * 7) / 10;
+        TxLockVHWM = (nTxLock * 8) / 10;
+        /*
+         * initialize transaction block (tblock) table
+         *
+         * transaction id (tid) = tblock index
+         * tid = 0 is reserved.
+         */
+        size = sizeof(struct tblock) * nTxBlock;
+        TxBlock = vmalloc(size);
+        if (TxBlock == NULL)
+                return -ENOMEM;
+        /* chain entries 1..nTxBlock-1; the last one terminates with 0 */
+        for (k = 1; k < nTxBlock; k++) {
+                TxBlock[k].next = (k == nTxBlock - 1) ? 0 : k + 1;
+                init_waitqueue_head(&TxBlock[k].gcwait);
+                init_waitqueue_head(&TxBlock[k].waitor);
+        }
+        TxAnchor.freetid = 1;
+        init_waitqueue_head(&TxAnchor.freewait);
+        stattx.maxtid = 1;      /* statistics */
+        /*
+         * initialize transaction lock (tlock) table
+         *
+         * transaction lock id = tlock index
+         * tlock id = 0 is reserved.
+         */
+        size = sizeof(struct tlock) * nTxLock;
+        TxLock = vmalloc(size);
+        if (TxLock == NULL) {
+                vfree(TxBlock);
+                return -ENOMEM;
+        }
+        /* chain entries 1..nTxLock-1; the last one terminates with 0 */
+        for (k = 1; k < nTxLock; k++)
+                TxLock[k].next = (k == nTxLock - 1) ? 0 : k + 1;
+        init_waitqueue_head(&TxAnchor.freelockwait);
+        init_waitqueue_head(&TxAnchor.lowlockwait);
+        TxAnchor.freelock = 1;
+        TxAnchor.tlocksInUse = 0;
+        INIT_LIST_HEAD(&TxAnchor.anon_list);
+        INIT_LIST_HEAD(&TxAnchor.anon_list2);
+        LAZY_LOCK_INIT();
+        INIT_LIST_HEAD(&TxAnchor.unlock_queue);
+        stattx.maxlid = 1;      /* statistics */
+        return 0;
+}
+/*
+ * NAME:        txExit()
+ *
+ * FUNCTION:    clean up when module is unloaded (vfree both tables set up by txInit())
+ */
+void txExit(void)
+{
+        vfree(TxLock);
+        TxLock = NULL;          /* do not leave a dangling pointer behind */
+        vfree(TxBlock);
+        TxBlock = NULL;         /* do not leave a dangling pointer behind */
+}
+/*
+ * NAME:        txBegin()
+ *
+ * FUNCTION:    start a transaction: take a tblock off the freelist and reset it.
+ *              Sleeps and retries while one of the conditions below holds.
+ * PARAMETER:   sb      - superblock
+ *              flag    - force for nested tx; COMMIT_FORCE skips the barrier wait
+ *                        and may take the last tblock; flag != 0 skips the tlock wait
+ * RETURN:      tid     - transaction id (index of the allocated tblock)
+ *
+ * note: flag force allows a nested tx to start,
+ * to prevent deadlock on logsync barrier;
+ */
+tid_t txBegin(struct super_block *sb, int flag)
+{
+        tid_t t;
+        struct tblock *tblk;
+        struct jfs_log *log;
+        jfs_info("txBegin: flag = 0x%x", flag);
+        log = JFS_SBI(sb)->log;
+        TXN_LOCK();
+        INCREMENT(TxStat.txBegin);
+      retry:
+        if (!(flag & COMMIT_FORCE)) {
+                /*
+                 * synchronize with logsync barrier
+                 */
+                if (test_bit(log_SYNCBARRIER, &log->flag) ||
+                    test_bit(log_QUIESCE, &log->flag)) {
+                        INCREMENT(TxStat.txBegin_barrier);
+                        TXN_SLEEP(&log->syncwait);
+                        goto retry;     /* re-check every condition after waking */
+                }
+        }
+        if (flag == 0) {
+                /*
+                 * Don't begin transaction if we're getting starved for tlocks
+                 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
+                 * free tlocks)
+                 */
+                if (TxAnchor.tlocksInUse > TxLockVHWM) {
+                        INCREMENT(TxStat.txBegin_lockslow);
+                        TXN_SLEEP(&TxAnchor.lowlockwait);
+                        goto retry;
+                }
+        }
+        /*
+         * allocate transaction id/block
+         */
+        if ((t = TxAnchor.freetid) == 0) {      /* freelist is empty */
+                jfs_info("txBegin: waiting for free tid");
+                INCREMENT(TxStat.txBegin_freetid);
+                TXN_SLEEP(&TxAnchor.freewait);
+                goto retry;
+        }
+        tblk = tid_to_tblock(t);
+        if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
+                /* Don't let a non-forced transaction take the last tblk */
+                jfs_info("txBegin: waiting for free tid");
+                INCREMENT(TxStat.txBegin_freetid);
+                TXN_SLEEP(&TxAnchor.freewait);
+                goto retry;
+        }
+        TxAnchor.freetid = tblk->next;  /* unlink the tblock from the freelist */
+        /*
+         * initialize transaction
+         */
+        /*
+         * We can't zero the whole thing or we screw up another thread being
+         * awakened after sleeping on tblk->waitor
+         *
+         * memset(tblk, 0, sizeof(struct tblock));
+         */
+        tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;
+        tblk->sb = sb;
+        ++log->logtid;
+        tblk->logtid = log->logtid;     /* per-log transaction number, just incremented */
+        ++log->active;                  /* decremented again in txEnd() */
+        HIGHWATERMARK(stattx.maxtid, t);        /* statistics */
+        INCREMENT(stattx.ntid); /* statistics */
+        TXN_UNLOCK();
+        jfs_info("txBegin: returning tid = %d", t);
+        return t;
+}
+/*
+ * NAME:        txBeginAnon()
+ *
+ * FUNCTION:    start an anonymous transaction.
+ *              Blocks while the log is at a sync barrier or quiesced, or while
+ *              tlocks are low, so anonymous tlocks cannot deplete the supply.
+ *
+ * PARAMETER:   sb      - superblock
+ *
+ * RETURN:      none (no tblock is allocated here)
+ */
+void txBeginAnon(struct super_block *sb)
+{
+        struct jfs_log *log;
+        log = JFS_SBI(sb)->log;
+        TXN_LOCK();
+        INCREMENT(TxStat.txBeginAnon);
+      retry:
+        /*
+         * synchronize with logsync barrier
+         */
+        if (test_bit(log_SYNCBARRIER, &log->flag) ||
+            test_bit(log_QUIESCE, &log->flag)) {
+                INCREMENT(TxStat.txBeginAnon_barrier);
+                TXN_SLEEP(&log->syncwait);
+                goto retry;     /* re-check both conditions after waking */
+        }
+        /*
+         * Don't begin transaction if we're getting starved for tlocks
+         */
+        if (TxAnchor.tlocksInUse > TxLockVHWM) {
+                INCREMENT(TxStat.txBeginAnon_lockslow);
+                TXN_SLEEP(&TxAnchor.lowlockwait);
+                goto retry;
+        }
+        TXN_UNLOCK();
+}
+/*
+ *      txEnd()
+ *
+ * function: free specified transaction block.
+ *
+ *      logsync barrier processing: when the last active transaction of the log
+ *      ends, log_SYNCBARRIER (if set) is cleared and log->syncwait is woken.
+ * serialization: takes TXN_LOCK itself; the lazy path uses log->gclock instead
+ */
+void txEnd(tid_t tid)
+{
+        struct tblock *tblk = tid_to_tblock(tid);
+        struct jfs_log *log;
+        jfs_info("txEnd: tid = %d", tid);
+        TXN_LOCK();
+        /*
+         * wakeup transactions waiting on the page locked
+         * by the current transaction
+         */
+        TXN_WAKEUP(&tblk->waitor);
+        log = JFS_SBI(tblk->sb)->log;
+        /*
+         * Lazy commit thread can't free this guy until we mark it UNLOCKED,
+         * otherwise, we would be left with a transaction that may have been
+         * reused.
+         *
+         * Lazy commit thread will turn off tblkGC_LAZY before calling this
+         * routine.
+         */
+        if (tblk->flag & tblkGC_LAZY) {
+                jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
+                TXN_UNLOCK();
+                spin_lock_irq(&log->gclock);    // LOGGC_LOCK
+                tblk->flag |= tblkGC_UNLOCKED;
+                spin_unlock_irq(&log->gclock);  // LOGGC_UNLOCK
+                return;         /* tblock is NOT put back on the freelist here */
+        }
+        jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);
+        assert(tblk->next == 0);        /* no tlocks may still hang off the tblock */
+        /*
+         * insert tblock back on freelist
+         */
+        tblk->next = TxAnchor.freetid;
+        TxAnchor.freetid = tid;
+        /*
+         * mark the tblock not active
+         */
+        if (--log->active == 0) {       /* that was the last active transaction */
+                clear_bit(log_FLUSH, &log->flag);
+                /*
+                 * synchronize with logsync barrier
+                 */
+                if (test_bit(log_SYNCBARRIER, &log->flag)) {
+                        /* forward log syncpt */
+                        /* lmSync(log); */
+                        jfs_info("log barrier off: 0x%x", log->lsn);
+                        /* enable new transactions start */
+                        clear_bit(log_SYNCBARRIER, &log->flag);
+                        /* wakeup all waitors for logsync barrier */
+                        TXN_WAKEUP(&log->syncwait);
+                }
+        }
+        /*
+         * wakeup all waitors for a free tblock
+         */
+        TXN_WAKEUP(&TxAnchor.freewait);
+        TXN_UNLOCK();
+}
+/*
+ *      txLock()
+ *
+ * function: acquire a transaction lock on the specified <mp>
+ *
+ * parameter:   tid - requesting transaction (0 = anonymous); ip - inode of the page;
+ *              mp - metapage to lock; type - tlck* bits, OR'ed into tlck->type
+ * return:      pointer to the tlock, or NULL after sleeping because the page is
+ *              tlocked by another transaction (mp has been released by then)
+ * serialization: takes TXN_LOCK; the wait path ends in TXN_SLEEP_DROP_LOCK
+ */
+struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
+                     int type)
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        int dir_xtree = 0;
+        lid_t lid;
+        tid_t xtid;
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        struct linelock *linelock;
+        xtpage_t *p;
+        struct tblock *tblk;
+        TXN_LOCK();
+        if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
+            !(mp->xflag & COMMIT_PAGE)) {
+                /*
+                 * Directory inode is special.  It can have both an xtree tlock
+                 * and a dtree tlock associated with it.
+                 */
+                dir_xtree = 1;
+                lid = jfs_ip->xtlid;    /* xtree tlock id is kept in the inode, not in mp */
+        } else
+                lid = mp->lid;
+        /* is page not locked by a transaction ? */
+        if (lid == 0)
+                goto allocateLock;
+        jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);
+        /* is page locked by the requester transaction ? */
+        tlck = lid_to_tlock(lid);
+        if ((xtid = tlck->tid) == tid)
+                goto grantLock;
+        /*
+         * is page locked by anonymous transaction/lock ?
+         *
+         * (page update without transaction (i.e., file write) is
+         * locked under anonymous transaction tid = 0:
+         * anonymous tlocks maintained on anonymous tlock list of
+         * the inode of the page and available to all anonymous
+         * transactions until txCommit() time at which point
+         * they are transferred to the transaction tlock list of
+         * the committing transaction of the inode)
+         */
+        if (xtid == 0) {
+                tlck->tid = tid;        /* the requester takes over the anonymous tlock */
+                tblk = tid_to_tblock(tid);
+                /*
+                 * The order of the tlocks in the transaction is important
+                 * (during truncate, child xtree pages must be freed before
+                 * parent's tlocks change the working map).
+                 * Take tlock off anonymous list and add to tail of
+                 * transaction list
+                 *
+                 * Note:  We really need to get rid of the tid & lid and
+                 * use list_head's.  This code is getting UGLY!
+                 */
+                if (jfs_ip->atlhead == lid) {
+                        if (jfs_ip->atltail == lid) {
+                                /* only anonymous txn.
+                                 * Remove from anon_list
+                                 */
+                                list_del_init(&jfs_ip->anon_inode_list);
+                        }
+                        jfs_ip->atlhead = tlck->next;
+                } else {
+                        lid_t last;
+                        for (last = jfs_ip->atlhead;    /* find the predecessor of lid */
+                             lid_to_tlock(last)->next != lid;
+                             last = lid_to_tlock(last)->next) {
+                                assert(last);
+                        }
+                        lid_to_tlock(last)->next = tlck->next;  /* unlink lid */
+                        if (jfs_ip->atltail == lid)
+                                jfs_ip->atltail = last;
+                }
+                /* insert the tlock at tail of transaction tlock list */
+                if (tblk->next)
+                        lid_to_tlock(tblk->last)->next = lid;
+                else
+                        tblk->next = lid;
+                tlck->next = 0;
+                tblk->last = lid;
+                goto grantLock;
+        }
+        goto waitLock;  /* held by a different, non-anonymous transaction */
+        /*
+         * allocate a tlock
+         */
+      allocateLock:
+        lid = txLockAlloc();
+        tlck = lid_to_tlock(lid);
+        /*
+         * initialize tlock
+         */
+        tlck->tid = tid;
+        /* mark tlock for meta-data page */
+        if (mp->xflag & COMMIT_PAGE) {
+                tlck->flag = tlckPAGELOCK;
+                /* mark the page dirty and nohomeok */
+                mark_metapage_dirty(mp);
+                atomic_inc(&mp->nohomeok);      /* dropped again in txUnlock() */
+                jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
+                         mp, atomic_read(&mp->nohomeok), tid, tlck);
+                /* if anonymous transaction, and buffer is on the group
+                 * commit synclist, mark inode to show this.  This will
+                 * prevent the buffer from being marked nohomeok for too
+                 * long a time.
+                 */
+                if ((tid == 0) && mp->lsn)
+                        set_cflag(COMMIT_Synclist, ip);
+        }
+        /* mark tlock for in-memory inode */
+        else
+                tlck->flag = tlckINODELOCK;
+        tlck->type = 0;         /* the requested type bits are OR'ed in at grantLock */
+        /* bind the tlock and the page */
+        tlck->ip = ip;
+        tlck->mp = mp;
+        if (dir_xtree)
+                jfs_ip->xtlid = lid;
+        else
+                mp->lid = lid;
+        /*
+         * enqueue transaction lock to transaction/inode
+         */
+        /* insert the tlock at tail of transaction tlock list */
+        if (tid) {
+                tblk = tid_to_tblock(tid);
+                if (tblk->next)
+                        lid_to_tlock(tblk->last)->next = lid;
+                else
+                        tblk->next = lid;
+                tlck->next = 0;
+                tblk->last = lid;
+        }
+        /* anonymous transaction:
+         * insert the tlock at head of inode anonymous tlock list
+         */
+        else {
+                tlck->next = jfs_ip->atlhead;
+                jfs_ip->atlhead = lid;
+                if (tlck->next == 0) {
+                        /* This inode's first anonymous transaction */
+                        jfs_ip->atltail = lid;
+                        list_add_tail(&jfs_ip->anon_inode_list,
+                                      &TxAnchor.anon_list);
+                }
+        }
+        /* initialize type dependent area for linelock */
+        linelock = (struct linelock *) & tlck->lock;
+        linelock->next = 0;
+        linelock->flag = tlckLINELOCK;
+        linelock->maxcnt = TLOCKSHORT;
+        linelock->index = 0;
+        switch (type & tlckTYPE) {      /* slot size depends on the kind of object */
+        case tlckDTREE:
+                linelock->l2linesize = L2DTSLOTSIZE;
+                break;
+        case tlckXTREE:
+                linelock->l2linesize = L2XTSLOTSIZE;
+                xtlck = (struct xtlock *) linelock;
+                xtlck->header.offset = 0;
+                xtlck->header.length = 2;
+                if (type & tlckNEW) {
+                        xtlck->lwm.offset = XTENTRYSTART;
+                } else {
+                        if (mp->xflag & COMMIT_PAGE)
+                                p = (xtpage_t *) mp->data;
+                        else
+                                p = &jfs_ip->i_xtroot;  /* xtree root kept in the in-memory inode */
+                        xtlck->lwm.offset =
+                            le16_to_cpu(p->header.nextindex);
+                }
+                xtlck->lwm.length = 0;  /* ! */
+                xtlck->twm.offset = 0;
+                xtlck->hwm.offset = 0;
+                xtlck->index = 2;
+                break;
+        case tlckINODE:
+                linelock->l2linesize = L2INODESLOTSIZE;
+                break;
+        case tlckDATA:
+                linelock->l2linesize = L2DATASLOTSIZE;
+                break;
+        default:
+                jfs_err("UFO tlock:0x%p", tlck);        /* unknown type: logged only */
+        }
+        /*
+         * update tlock vector
+         */
+      grantLock:
+        tlck->type |= type;
+        TXN_UNLOCK();
+        return tlck;
+        /*
+         * page is being locked by another transaction:
+         */
+      waitLock:
+        /* Only locks on ipimap or ipaimap should reach here */
+        /* assert(jfs_ip->fileset == AGGREGATE_I); */
+        if (jfs_ip->fileset != AGGREGATE_I) {
+                jfs_err("txLock: trying to lock locked page!");
+                dump_mem("ip", ip, sizeof(struct inode));
+                dump_mem("mp", mp, sizeof(struct metapage));
+                dump_mem("Locker's tblk", tid_to_tblock(tid),
+                         sizeof(struct tblock));
+                dump_mem("Tlock", tlck, sizeof(struct tlock));
+                BUG();
+        }
+        INCREMENT(stattx.waitlock);     /* statistics */
+        release_metapage(mp);   /* give up the page before sleeping */
+        jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
+                 tid, xtid, lid);
+        TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);      /* woken by the holder's txRelease()/txEnd() */
+        jfs_info("txLock: awakened     tid = %d, lid = %d", tid, lid);
+        return NULL;    /* no tlock was granted */
+}
+/*
+ * NAME:        txRelease()
+ *
+ * FUNCTION:    Release buffers associated with transaction locks, but don't
+ *              mark homeok yet.  This allows other transactions to modify
+ *              buffers, but won't let them go to disk until commit record
+ *              actually gets written.
+ *
+ * PARAMETER:
+ *              tblk    - transaction block whose tlock list is walked
+ *
+ * RETURN:      none (the function is void)
+ */
+static void txRelease(struct tblock * tblk)
+{
+        struct metapage *mp;
+        lid_t lid;
+        struct tlock *tlck;
+        TXN_LOCK();
+        for (lid = tblk->next; lid; lid = tlck->next) {
+                tlck = lid_to_tlock(lid);
+                if ((mp = tlck->mp) != NULL &&
+                    (tlck->type & tlckBTROOT) == 0) {
+                        assert(mp->xflag & COMMIT_PAGE);
+                        mp->lid = 0;    /* page no longer points at a tlock */
+                }
+        }
+        /*
+         * wakeup transactions waiting on a page locked
+         * by the current transaction
+         */
+        TXN_WAKEUP(&tblk->waitor);
+        TXN_UNLOCK();
+}
+/*
+ * NAME:        txUnlock()
+ *
+ * FUNCTION:    Initiates pageout of pages modified by tid in journalled
+ *              objects and frees their lockwords.
+ */
+static void txUnlock(struct tblock * tblk)
+{
+        struct tlock *tlck;
+        struct linelock *linelock;
+        lid_t lid, next, llid, k;
+        struct metapage *mp;
+        struct jfs_log *log;
+        int difft, diffp;
+        jfs_info("txUnlock: tblk = 0x%p", tblk);
+        log = JFS_SBI(tblk->sb)->log;
+        /*
+         * mark page under tlock homeok (its log has been written):
+         */
+        for (lid = tblk->next; lid; lid = next) {
+                tlck = lid_to_tlock(lid);
+                next = tlck->next;      /* saved first: the tlock is freed below */
+                jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);
+                /* unbind page from tlock */
+                if ((mp = tlck->mp) != NULL &&
+                    (tlck->type & tlckBTROOT) == 0) {
+                        assert(mp->xflag & COMMIT_PAGE);
+                        /* hold buffer
+                         *
+                         * It's possible that someone else has the metapage.
+                         * The only things we're changing are nohomeok, which
+                         * is handled atomically, and clsn which is protected
+                         * by the LOGSYNC_LOCK.
+                         */
+                        hold_metapage(mp, 1);
+                        assert(atomic_read(&mp->nohomeok) > 0);
+                        atomic_dec(&mp->nohomeok);      /* pairs with the atomic_inc in txLock() */
+                        /* inherit younger/larger clsn */
+                        LOGSYNC_LOCK(log);
+                        if (mp->clsn) {
+                                logdiff(difft, tblk->clsn, log);
+                                logdiff(diffp, mp->clsn, log);
+                                if (difft > diffp)
+                                        mp->clsn = tblk->clsn;
+                        } else
+                                mp->clsn = tblk->clsn;
+                        LOGSYNC_UNLOCK(log);
+                        assert(!(tlck->flag & tlckFREEPAGE));
+                        if (tlck->flag & tlckWRITEPAGE) {
+                                write_metapage(mp);
+                        } else {
+                                /* release page which has been forced */
+                                release_metapage(mp);
+                        }
+                }
+                /* insert tlock, and linelock(s) of the tlock if any,
+                 * at head of freelist
+                 */
+                TXN_LOCK();
+                llid = ((struct linelock *) & tlck->lock)->next;
+                while (llid) {
+                        linelock = (struct linelock *) lid_to_tlock(llid);
+                        k = linelock->next;     /* read the link before freeing the entry */
+                        txLockFree(llid);
+                        llid = k;
+                }
+                txLockFree(lid);
+                TXN_UNLOCK();
+        }
+        tblk->next = tblk->last = 0;    /* the transaction's tlock list is now empty */
+        /*
+         * remove tblock from logsynclist
+         * (allocation map pages inherited lsn of tblk and
+         * has been inserted in logsync list at txUpdateMap())
+         */
+        if (tblk->lsn) {
+                LOGSYNC_LOCK(log);
+                log->count--;
+                list_del(&tblk->synclist);
+                LOGSYNC_UNLOCK(log);
+        }
+}
+/*
+ *      txMaplock()
+ *
+ * function: allocate a transaction lock (not bound to a metapage) for freed page/entry;
+ *      for freed page, maplock is used as xtlock/dtlock type;
+ */
+struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        lid_t lid;
+        struct tblock *tblk;
+        struct tlock *tlck;
+        struct maplock *maplock;
+        TXN_LOCK();
+        /*
+         * allocate a tlock
+         */
+        lid = txLockAlloc();
+        tlck = lid_to_tlock(lid);
+        /*
+         * initialize tlock
+         */
+        tlck->tid = tid;
+        /* bind the tlock and the object */
+        tlck->flag = tlckINODELOCK;
+        tlck->ip = ip;
+        tlck->mp = NULL;        /* no metapage is associated with a maplock */
+        tlck->type = type;
+        /*
+         * enqueue transaction lock to transaction/inode
+         */
+        /* insert the tlock at tail of transaction tlock list */
+        if (tid) {
+                tblk = tid_to_tblock(tid);
+                if (tblk->next)
+                        lid_to_tlock(tblk->last)->next = lid;
+                else
+                        tblk->next = lid;
+                tlck->next = 0;
+                tblk->last = lid;
+        }
+        /* anonymous transaction:
+         * insert the tlock at head of inode anonymous tlock list
+         */
+        else {
+                tlck->next = jfs_ip->atlhead;
+                jfs_ip->atlhead = lid;
+                if (tlck->next == 0) {
+                        /* This inode's first anonymous transaction */
+                        jfs_ip->atltail = lid;
+                        list_add_tail(&jfs_ip->anon_inode_list,
+                                      &TxAnchor.anon_list);
+                }
+        }
+        TXN_UNLOCK();
+        /* initialize type dependent area for maplock (done after TXN_UNLOCK) */
+        maplock = (struct maplock *) & tlck->lock;
+        maplock->next = 0;
+        maplock->maxcnt = 0;
+        maplock->index = 0;
+        return tlck;    /* caller receives the tlock with an empty maplock area */
+}
+/*
+ *      txLinelock()
+ *
+ * function: allocate a transaction lock for log vector list; returns the new linelock
+ */
+struct linelock *txLinelock(struct linelock * tlock)
+{
+        lid_t lid;
+        struct tlock *tlck;
+        struct linelock *linelock;
+        TXN_LOCK();
+        /* allocate a TxLock structure */
+        lid = txLockAlloc();
+        tlck = lid_to_tlock(lid);
+        TXN_UNLOCK();
+        /* initialize linelock: the whole TxLock entry is used as a long linelock */
+        linelock = (struct linelock *) tlck;
+        linelock->next = 0;     /* overwritten by the splice below */
+        linelock->flag = tlckLINELOCK;
+        linelock->maxcnt = TLOCKLONG;
+        linelock->index = 0;
+        /* append linelock after tlock */
+        linelock->next = tlock->next;
+        tlock->next = lid;
+        return linelock;
+}
+/*
+ *              transaction commit management
+ *              -----------------------------
+ */
+/*
+ * NAME:        txCommit()
+ *
+ * FUNCTION:    commit the changes to the objects specified in
+ *              clist.  For journalled segments only the
+ *              changes of the caller are committed, ie by tid.
+ *              for non-journalled segments the data are flushed to
+ *              disk and then the change to the disk inode and indirect
+ *              blocks committed (so blocks newly allocated to the
+ *              segment will be made a part of the segment atomically).
+ *
+ *              all of the segments specified in clist must be in
+ *              one file system. no more than 6 segments are needed
+ *              to handle all unix svcs.
+ *
+ *              if the i_nlink field (i.e. disk inode link count)
+ *              is zero, and the type of inode is a regular file or
+ *              directory, or symbolic link , the inode is truncated
+ *              to zero length. the truncation is committed but the
+ *              VM resources are unaffected until it is closed (see
+ *              iput and iclose).
+ *
+ * PARAMETER:   tid, nip, iplist, flag - see the declarations below;
+ *              iplist is re-sorted in place by descending inode number
+ * RETURN:      0 on success, -EROFS on a read-only file system, otherwise
+ *              the error returned by diWrite() or txLog()
+ * serialization:
+ *              on entry the inode lock on each segment is assumed
+ *              to be held.
+ *
+ * i/o error:   a diWrite() failure aborts the transaction via txAbort(tid, 1)
+ */
+int txCommit(tid_t tid,         /* transaction identifier */
+             int nip,           /* number of inodes to commit */
+             struct inode **iplist,     /* list of inode to commit */
+             int flag)          /* COMMIT_* bits, OR'ed into tblk->xflag */
+{
+        int rc = 0;
+        struct commit cd;
+        struct jfs_log *log;
+        struct tblock *tblk;
+        struct lrd *lrd;
+        int lsn;        /* NOTE(review): assigned from lmLog() below, never read here */
+        struct inode *ip;
+        struct jfs_inode_info *jfs_ip;
+        int k, n;
+        ino_t top;
+        struct super_block *sb;
+        jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
+        /* is read-only file system ? */
+        if (isReadOnly(iplist[0])) {
+                rc = -EROFS;
+                goto TheEnd;
+        }
+        sb = cd.sb = iplist[0]->i_sb;
+        cd.tid = tid;           /* recorded before a tid is allocated for tid == 0 */
+        if (tid == 0)
+                tid = txBegin(sb, 0);
+        tblk = tid_to_tblock(tid);
+        /*
+         * initialize commit structure
+         */
+        log = JFS_SBI(sb)->log;
+        cd.log = log;
+        /* initialize log record descriptor in commit */
+        lrd = &cd.lrd;
+        lrd->logtid = cpu_to_le32(tblk->logtid);
+        lrd->backchain = 0;
+        tblk->xflag |= flag;
+        if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
+                tblk->xflag |= COMMIT_LAZY;     /* neither forced nor sync: commit lazily */
+        /*
+         *      prepare non-journaled objects for commit
+         *
+         * flush data pages of non-journaled file
+         * to prevent the file getting non-initialized disk blocks
+         * in case of crash.
+         * (new blocks - )
+         */
+        cd.iplist = iplist;
+        cd.nip = nip;
+        /*
+         *      acquire transaction lock on (on-disk) inodes
+         *
+         * update on-disk inode from in-memory inode
+         * acquiring transaction locks for AFTER records
+         * on the on-disk inode of file object
+         *
+         * sort the inodes array by inode number in descending order
+         * to prevent deadlock when acquiring transaction lock
+         * of on-disk inodes on multiple on-disk inode pages by
+         * multiple concurrent transactions
+         */
+        for (k = 0; k < cd.nip; k++) {
+                top = (cd.iplist[k])->i_ino;
+                for (n = k + 1; n < cd.nip; n++) {      /* move the largest remaining i_ino to slot k */
+                        ip = cd.iplist[n];
+                        if (ip->i_ino > top) {
+                                top = ip->i_ino;
+                                cd.iplist[n] = cd.iplist[k];
+                                cd.iplist[k] = ip;
+                        }
+                }
+                ip = cd.iplist[k];
+                jfs_ip = JFS_IP(ip);
+                /*
+                 * BUGBUG - This code has temporarily been removed.  The
+                 * intent is to ensure that any file data is written before
+                 * the metadata is committed to the journal.  This prevents
+                 * uninitialized data from appearing in a file after the
+                 * journal has been replayed.  (The uninitialized data
+                 * could be sensitive data removed by another user.)
+                 *
+                 * The problem now is that we are holding the IWRITELOCK
+                 * on the inode, and calling filemap_fdatawrite on an
+                 * unmapped page will cause a deadlock in jfs_get_block.
+                 *
+                 * The long term solution is to pare down the use of
+                 * IWRITELOCK.  We are currently holding it too long.
+                 * We could also be smarter about which data pages need
+                 * to be written before the transaction is committed and
+                 * when we don't need to worry about it at all.
+                 *
+                 * if ((!S_ISDIR(ip->i_mode))
+                 *    && (tblk->flag & COMMIT_DELETE) == 0) {
+                 *      filemap_fdatawrite(ip->i_mapping);
+                 *      filemap_fdatawait(ip->i_mapping);
+                 * }
+                 */
+                /*
+                 * Mark inode as not dirty.  It will still be on the dirty
+                 * inode list, but we'll know not to commit it again unless
+                 * it gets marked dirty again
+                 */
+                clear_cflag(COMMIT_Dirty, ip);
+                /* inherit anonymous tlock(s) of inode */
+                if (jfs_ip->atlhead) {
+                        lid_to_tlock(jfs_ip->atltail)->next = tblk->next;       /* splice in front of the tx list */
+                        tblk->next = jfs_ip->atlhead;
+                        if (!tblk->last)
+                                tblk->last = jfs_ip->atltail;
+                        jfs_ip->atlhead = jfs_ip->atltail = 0;
+                        TXN_LOCK();
+                        list_del_init(&jfs_ip->anon_inode_list);
+                        TXN_UNLOCK();
+                }
+                /*
+                 * acquire transaction lock on on-disk inode page
+                 * (become first tlock of the tblk's tlock list)
+                 */
+                if (((rc = diWrite(tid, ip))))
+                        goto out;       /* out: calls txAbort() because rc != 0 */
+        }
+        /*
+         *      write log records from transaction locks
+         *
+         * txUpdateMap() resets XAD_NEW in XAD.
+         */
+        if ((rc = txLog(log, tblk, &cd)))
+                goto TheEnd;    /* NOTE(review): bypasses the txAbort() at out: - confirm intended */
+        /*
+         * Ensure that inode isn't reused before
+         * lazy commit thread finishes processing
+         */
+        if (tblk->xflag & COMMIT_DELETE) {
+                atomic_inc(&tblk->u.ip->i_count);
+                /*
+                 * Avoid a rare deadlock
+                 *
+                 * If the inode is locked, we may be blocked in
+                 * jfs_commit_inode.  If so, we don't want the
+                 * lazy_commit thread doing the last iput() on the inode
+                 * since that may block on the locked inode.  Instead,
+                 * commit the transaction synchronously, so the last iput
+                 * will be done by the calling thread (or later)
+                 */
+                if (tblk->u.ip->i_state & I_LOCK)
+                        tblk->xflag &= ~COMMIT_LAZY;
+        }
+        ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
+               ((tblk->u.ip->i_nlink == 0) &&
+                !test_cflag(COMMIT_Nolink, tblk->u.ip)));
+        /*
+         *      write COMMIT log record
+         */
+        lrd->type = cpu_to_le16(LOG_COMMIT);
+        lrd->length = 0;
+        lsn = lmLog(log, tblk, lrd, NULL);
+        lmGroupCommit(log, tblk);
+        /*
+         *      - transaction is now committed -
+         */
+        /*
+         * force pages in careful update
+         * (imap addressing structure update)
+         */
+        if (flag & COMMIT_FORCE)
+                txForce(tblk);
+        /*
+         *      update allocation map.
+         *
+         * update inode allocation map and inode:
+         * free pager lock on memory object of inode if any.
+         * update  block allocation map.
+         *
+         * txUpdateMap() resets XAD_NEW in XAD.
+         */
+        if (tblk->xflag & COMMIT_FORCE)
+                txUpdateMap(tblk);
+        /*
+         *      free transaction locks and pageout/free pages
+         */
+        txRelease(tblk);
+        if ((tblk->flag & tblkGC_LAZY) == 0)
+                txUnlock(tblk); /* skipped here when tblkGC_LAZY is set */
+        /*
+         *      reset in-memory object state
+         */
+        for (k = 0; k < cd.nip; k++) {
+                ip = cd.iplist[k];
+                jfs_ip = JFS_IP(ip);
+                /*
+                 * reset in-memory inode state
+                 */
+                jfs_ip->bxflag = 0;
+                jfs_ip->blid = 0;
+        }
+      out:
+        if (rc != 0)
+                txAbort(tid, 1);
+      TheEnd:
+        jfs_info("txCommit: tid = %d, returning %d", tid, rc);
+        return rc;
+}
+/*
+ * NAME:        txLog()
+ *
+ * FUNCTION:    Walk the tlock chain of a transaction block and hand each
+ *              tlock to the logger that matches its type (xtree, dtree,
+ *              inode, map or data).
+ *              Code assumes only WRITELOCKS are recorded in lockwords.
+ *
+ * PARAMETERS:  log     - log passed through to the per-type loggers
+ *              tblk    - transaction block; its chain starts at tblk->next
+ *              cd      - commit data; cd->lrd is the single log record
+ *                        descriptor reused for every tlock
+ *
+ * RETURN :     always 0
+ */
+static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
+{
+        struct lrd *rec = &cd->lrd;
+        struct tlock *cur;
+        struct inode *owner;
+        lid_t id = tblk->next;
+        /* a lid of 0 terminates the chain */
+        while (id) {
+                cur = lid_to_tlock(id);
+                cur->flag |= tlckLOG;
+                /* descriptor fields common to every record of this tlock */
+                owner = cur->ip;
+                rec->aggregate = cpu_to_le32(JFS_SBI(owner->i_sb)->aggregate);
+                rec->log.redopage.fileset = cpu_to_le32(JFS_IP(owner)->fileset);
+                rec->log.redopage.inode = cpu_to_le32(owner->i_ino);
+                /* dispatch on the tlock type; return values are not used */
+                switch (cur->type & tlckTYPE) {
+                case tlckINODE:
+                        diLog(log, tblk, rec, cur, cd);
+                        break;
+                case tlckDATA:
+                        dataLog(log, tblk, rec, cur);
+                        break;
+                case tlckMAP:
+                        mapLog(log, tblk, rec, cur);
+                        break;
+                case tlckDTREE:
+                        dtLog(log, tblk, rec, cur);
+                        break;
+                case tlckXTREE:
+                        xtLog(log, tblk, rec, cur);
+                        break;
+                default:
+                        jfs_err("UFO tlock:0x%p", cur);
+                }
+                /* advance only after the logger has run */
+                id = cur->next;
+        }
+        return 0;
+}
+/*
+ *      diLog()
+ *
+ * function:    log inode tlock and format maplock to update bmap;
+ *
+ * parameters:  log  - log that lmLog() writes the record to
+ *              tblk - transaction block being committed
+ *              lrd  - log record descriptor, filled in here
+ *              tlck - inode tlock to log; tlck->type selects the
+ *                     tlckENTRY (after-image) or tlckFREE (freed inode
+ *                     extent) handling
+ *              cd   - commit data; for tlckFREE, iplist[1] and iplist[2]
+ *                     are read as the IAG number and inode extent index
+ *
+ * return:      always 0 (rc is never changed after its initialization)
+ */
+static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+          struct tlock * tlck, struct commit * cd)
+{
+        int rc = 0;     /* never modified below */
+        struct metapage *mp;
+        pxd_t *pxd;
+        struct pxd_lock *pxdlock;
+        mp = tlck->mp;
+        /* initialize as REDOPAGE record format */
+        lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
+        lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);
+        pxd = &lrd->log.redopage.pxd;
+        /*
+         *      inode after image
+         */
+        if (tlck->type & tlckENTRY) {
+                /* log after-image for logredo(): */
+                lrd->type = cpu_to_le16(LOG_REDOPAGE);
+//              *pxd = mp->cm_pxd;
+                PXDaddress(pxd, mp->index);
+                PXDlength(pxd,
+                          mp->logical_size >> tblk->sb->s_blocksize_bits);
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /* mark page as homeward bound */
+                tlck->flag |= tlckWRITEPAGE;
+        } else if (tlck->type & tlckFREE) {
+                /*
+                 *      free inode extent
+                 *
+                 * (pages of the freed inode extent have been invalidated and
+                 * a maplock for free of the extent has been formatted at
+                 * txLock() time);
+                 *
+                 * the tlock had been acquired on the inode allocation map page
+                 * (iag) that specifies the freed extent, even though the map
+                 * page is not itself logged, to prevent pageout of the map
+                 * page before the log;
+                 */
+                /* log LOG_NOREDOINOEXT of the freed inode extent for
+                 * logredo() to start NoRedoPage filters, and to update
+                 * imap and bmap for free of the extent;
+                 */
+                lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
+                /*
+                 * For the LOG_NOREDOINOEXT record, we need
+                 * to pass the IAG number and inode extent
+                 * index (within that IAG) from which the
+                 * extent is being released.  These have been
+                 * passed to us in iplist[1] and iplist[2]
+                 * (integers stored in pointer-typed slots,
+                 * hence the casts below).
+                 */
+                lrd->log.noredoinoext.iagnum =
+                    cpu_to_le32((u32) (size_t) cd->iplist[1]);
+                lrd->log.noredoinoext.inoext_idx =
+                    cpu_to_le32((u32) (size_t) cd->iplist[2]);
+                pxdlock = (struct pxd_lock *) & tlck->lock;
+                *pxd = pxdlock->pxd;
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+                /* update bmap */
+                tlck->flag |= tlckUPDATEMAP;
+                /* mark page as homeward bound */
+                tlck->flag |= tlckWRITEPAGE;
+        } else
+                jfs_err("diLog: UFO type tlck:0x%p", tlck);
+#ifdef  _JFS_WIP
+        /*
+         *      alloc/free external EA extent
+         *
+         * a maplock for txUpdateMap() to update bPWMAP for alloc/free
+         * of the extent has been formatted at txLock() time;
+         *
+         * NOTE(review): work-in-progress code.  As written, the "else"
+         * below follows an if/else chain that is already terminated by
+         * the jfs_err() statement above, and nlock and i are not
+         * declared in this function, so this block presumably does not
+         * build when _JFS_WIP is defined - confirm before enabling.
+         */
+        else {
+                assert(tlck->type & tlckEA);
+                /* log LOG_UPDATEMAP for logredo() to update bmap for
+                 * alloc of new (and free of old) external EA extent;
+                 */
+                lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+                pxdlock = (struct pxd_lock *) & tlck->lock;
+                nlock = pxdlock->index;
+                for (i = 0; i < nlock; i++, pxdlock++) {
+                        if (pxdlock->flag & mlckALLOCPXD)
+                                lrd->log.updatemap.type =
+                                    cpu_to_le16(LOG_ALLOCPXD);
+                        else
+                                lrd->log.updatemap.type =
+                                    cpu_to_le16(LOG_FREEPXD);
+                        lrd->log.updatemap.nxd = cpu_to_le16(1);
+                        lrd->log.updatemap.pxd = pxdlock->pxd;
+                        lrd->backchain =
+                            cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+                }
+                /* update bmap */
+                tlck->flag |= tlckUPDATEMAP;
+        }
+#endif                          /* _JFS_WIP */
+        return rc;
+}
+/*
+ *      dataLog()
+ *
+ * function:    log data tlock: write a LOG_REDOPAGE/LOG_DATA after-image
+ *              of the tlock's metapage, or discard the metapage without
+ *              logging when jfs_dirtable_inline() is true for the inode;
+ *
+ * return:      always 0
+ */
+static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+            struct tlock * tlck)
+{
+        struct metapage *page = tlck->mp;
+        pxd_t *extent = &lrd->log.redopage.pxd;
+        /* fill in the REDOPAGE header before deciding whether to log */
+        lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
+        lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);
+        lrd->type = cpu_to_le16(LOG_REDOPAGE);
+        if (!jfs_dirtable_inline(tlck->ip)) {
+                /* normal case: log the after-image for logredo() */
+                PXDaddress(extent, page->index);
+                PXDlength(extent,
+                          page->logical_size >> tblk->sb->s_blocksize_bits);
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /* the page may now be written to its home location */
+                tlck->flag |= tlckWRITEPAGE;
+                return 0;
+        }
+        /*
+         * The table has been truncated, so the last entry must have
+         * been deleted: nothing worth logging.  Detach the metapage
+         * from the tlock and throw it away.
+         */
+        page->lid = 0;
+        hold_metapage(page, 0);
+        atomic_dec(&page->nohomeok);
+        discard_metapage(page);
+        tlck->mp = NULL;
+        return 0;
+}
+/*
+ *      dtLog()
+ *
+ * function:    log dtree tlock and format maplock to update bmap;
+ *
+ * The first matching case of tlck->type is handled, in this order:
+ *      tlckNEW | tlckEXTEND    - after-image of a new/extended page
+ *      tlckENTRY | tlckRELINK  - after-image of a modified page
+ *      tlckFREE | tlckRELOCATE - NOREDOPAGE record for a released page
+ */
+static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+           struct tlock * tlck)
+{
+        struct metapage *page = tlck->mp;
+        pxd_t *extent = &lrd->log.redopage.pxd;
+        struct pxd_lock *plock;
+        /* header shared by the REDOPAGE and NOREDOPAGE record formats */
+        lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
+        lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);
+        if (tlck->type & tlckBTROOT)
+                lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
+        if (tlck->type & (tlckNEW | tlckEXTEND)) {
+                /*
+                 * page extension (by relocation or in place), new right
+                 * page from a split, or in-line root reinitialized by a
+                 * root split: log the after-image, tagged LOG_EXTEND or
+                 * LOG_NEW so logredo() can initialize the freelist and
+                 * update bmap for the allocation
+                 */
+                lrd->type = cpu_to_le16(LOG_REDOPAGE);
+                if (!(tlck->type & tlckEXTEND))
+                        lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
+                else
+                        lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
+                PXDaddress(extent, page->index);
+                PXDlength(extent,
+                          page->logical_size >> tblk->sb->s_blocksize_bits);
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /* a root page gets no maplock and no WRITEPAGE mark */
+                if (!(tlck->type & tlckBTROOT)) {
+                        /* maplock for txUpdateMap(): alloc of the new page */
+                        tlck->flag |= tlckUPDATEMAP;
+                        plock = (struct pxd_lock *) & tlck->lock;
+                        plock->flag = mlckALLOCPXD;
+                        plock->pxd = *extent;
+                        plock->index = 1;
+                        /* mark page as homeward bound */
+                        tlck->flag |= tlckWRITEPAGE;
+                }
+        } else if (tlck->type & (tlckENTRY | tlckRELINK)) {
+                /*
+                 * entry insertion/deletion, or sibling link update of the
+                 * old right page before a split: log the after-image
+                 */
+                lrd->type = cpu_to_le16(LOG_REDOPAGE);
+                PXDaddress(extent, page->index);
+                PXDlength(extent,
+                          page->logical_size >> tblk->sb->s_blocksize_bits);
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /* mark page as homeward bound */
+                tlck->flag |= tlckWRITEPAGE;
+        } else if (tlck->type & (tlckFREE | tlckRELOCATE)) {
+                /*
+                 * page deletion (page already invalidated) or source
+                 * extent of a relocation: log LOG_NOREDOPAGE so logredo()
+                 * starts a NoRedoPage filter and frees the page in bmap;
+                 * the extent comes from the maplock formatted at
+                 * txLock() time, which txUpdateMap() will also use
+                 */
+                lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
+                plock = (struct pxd_lock *) & tlck->lock;
+                *extent = plock->pxd;
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+                tlck->flag |= tlckUPDATEMAP;
+        }
+}
+/*
+ *      xtLog()
+ *
+ * function:    log xtree tlock and format maplock to update bmap;
+ *
+ * parameters:  log  - log that lmLog() writes the record(s) to
+ *              tblk - transaction block being committed; COMMIT_LAZY may
+ *                     be cleared in tblk->xflag here whenever a maplock
+ *                     is left pointing into the xtree page itself
+ *              lrd  - log record descriptor, filled in here
+ *              tlck - xtree tlock to log; tlck->lock is reformatted as
+ *                     maplock(s) for txUpdateMap()
+ *
+ * The first matching case of tlck->type is handled:
+ *      tlckNEW | tlckGROW | tlckRELINK - entry insertion/extension
+ *      tlckFREE                        - page deletion
+ *      tlckTRUNCATE                    - page/entry truncation
+ */
+static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+           struct tlock * tlck)
+{
+        struct inode *ip;
+        struct metapage *mp;
+        xtpage_t *p;
+        struct xtlock *xtlck;
+        struct maplock *maplock;
+        struct xdlistlock *xadlock;
+        struct pxd_lock *pxdlock;
+        pxd_t *pxd;
+        int next, lwm, hwm;
+        ip = tlck->ip;
+        mp = tlck->mp;
+        /* initialize as REDOPAGE/NOREDOPAGE record format */
+        lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
+        lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);
+        pxd = &lrd->log.redopage.pxd;
+        /* the root lives in the in-memory inode, other pages in the metapage */
+        if (tlck->type & tlckBTROOT) {
+                lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
+                p = &JFS_IP(ip)->i_xtroot;
+                if (S_ISDIR(ip->i_mode))
+                        lrd->log.redopage.type |=
+                            cpu_to_le16(LOG_DIR_XTREE);
+        } else
+                p = (xtpage_t *) mp->data;
+        next = le16_to_cpu(p->header.nextindex);
+        /* three views of the same tlck->lock area */
+        xtlck = (struct xtlock *) & tlck->lock;
+        maplock = (struct maplock *) & tlck->lock;
+        xadlock = (struct xdlistlock *) maplock;
+        /*
+         *      entry insertion/extension;
+         *      sibling page link update (old right page before split);
+         */
+        if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
+                /* log after-image for logredo():
+                 * logredo() will update bmap for alloc of new/extended
+                 * extents (XAD_NEW|XAD_EXTENDED) of XAD[lwm:next) from
+                 * after-image of XADlist;
+                 * logredo() resets (XAD_NEW|XAD_EXTENDED) flag when
+                 * applying the after-image to the meta-data page.
+                 */
+                lrd->type = cpu_to_le16(LOG_REDOPAGE);
+//              *pxd = mp->cm_pxd;
+                PXDaddress(pxd, mp->index);
+                PXDlength(pxd,
+                          mp->logical_size >> tblk->sb->s_blocksize_bits);
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /* format a maplock for txUpdateMap() to update bPMAP
+                 * for alloc of new/extended extents of XAD[lwm:next)
+                 * from the page itself;
+                 * txUpdateMap() resets (XAD_NEW|XAD_EXTENDED) flag.
+                 */
+                lwm = xtlck->lwm.offset;
+                /* a zero low-water mark is treated as XTPAGEMAXSLOT */
+                if (lwm == 0)
+                        lwm = XTPAGEMAXSLOT;
+                /* nothing in [lwm:next): no maplock needed */
+                if (lwm == next)
+                        goto out;
+                if (lwm > next) {
+                        /* no trailing newline, matching the other
+                         * jfs_err()/jfs_info() calls in this file
+                         */
+                        jfs_err("xtLog: lwm > next");
+                        goto out;
+                }
+                tlck->flag |= tlckUPDATEMAP;
+                xadlock->flag = mlckALLOCXADLIST;
+                xadlock->count = next - lwm;
+                if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) {
+                        int i;
+                        /*
+                         * Lazy commit may allow xtree to be modified before
+                         * txUpdateMap runs.  Copy xad into linelock to
+                         * preserve correct data.
+                         */
+                        xadlock->xdlist = &xtlck->pxdlock;
+                        memcpy(xadlock->xdlist, &p->xad[lwm],
+                               sizeof(xad_t) * xadlock->count);
+                        /* the copy keeps the flags; clear them in the page */
+                        for (i = 0; i < xadlock->count; i++)
+                                p->xad[lwm + i].flag &=
+                                    ~(XAD_NEW | XAD_EXTENDED);
+                } else {
+                        /*
+                         * xdlist will point into the inode's xtree; ensure
+                         * that transaction is not committed lazily.
+                         */
+                        xadlock->xdlist = &p->xad[lwm];
+                        tblk->xflag &= ~COMMIT_LAZY;
+                }
+                jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d "
+                         "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count);
+                maplock->index = 1;
+              out:
+                /* mark page as homeward bound */
+                tlck->flag |= tlckWRITEPAGE;
+                return;
+        }
+        /*
+         *      page deletion: file deletion/truncation (ref. xtTruncate())
+         *
+         * (page will be invalidated after log is written and bmap
+         * is updated from the page);
+         */
+        if (tlck->type & tlckFREE) {
+                /* LOG_NOREDOPAGE log for NoRedoPage filter:
+                 * if page free from file delete, NoRedoFile filter from
+                 * inode image of zero link count will subsume NoRedoPage
+                 * filters for each page;
+                 * if page free from file truncation, write NoRedoPage
+                 * filter;
+                 *
+                 * update of block allocation map for the page itself:
+                 * if page free from deletion and truncation, LOG_UPDATEMAP
+                 * log for the page itself is generated from processing
+                 * its parent page xad entries;
+                 */
+                /* if page free from file truncation, log LOG_NOREDOPAGE
+                 * of the deleted page for logredo() to start NoRedoPage
+                 * filter for the page;
+                 */
+                if (tblk->xflag & COMMIT_TRUNCATE) {
+                        /* write NOREDOPAGE for the page */
+                        lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
+                        PXDaddress(pxd, mp->index);
+                        PXDlength(pxd,
+                                  mp->logical_size >> tblk->sb->
+                                  s_blocksize_bits);
+                        lrd->backchain =
+                            cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+                        if (tlck->type & tlckBTROOT) {
+                                /* Empty xtree must be logged */
+                                lrd->type = cpu_to_le16(LOG_REDOPAGE);
+                                lrd->backchain =
+                                    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                        }
+                }
+                /* init LOG_UPDATEMAP of the freed extents
+                 * XAD[XTENTRYSTART:hwm] from the deleted page itself
+                 * for logredo() to update bmap;
+                 */
+                lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+                lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
+                xtlck = (struct xtlock *) & tlck->lock;
+                hwm = xtlck->hwm.offset;
+                lrd->log.updatemap.nxd =
+                    cpu_to_le16(hwm - XTENTRYSTART + 1);
+                /* reformat linelock for lmLog() */
+                xtlck->header.offset = XTENTRYSTART;
+                xtlck->header.length = hwm - XTENTRYSTART + 1;
+                xtlck->index = 1;
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /* format a maplock for txUpdateMap() to update bmap
+                 * to free extents of XAD[XTENTRYSTART:hwm] from the
+                 * deleted page itself;
+                 */
+                tlck->flag |= tlckUPDATEMAP;
+                xadlock->flag = mlckFREEXADLIST;
+                xadlock->count = hwm - XTENTRYSTART + 1;
+                if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) {
+                        /*
+                         * Lazy commit may allow xtree to be modified before
+                         * txUpdateMap runs.  Copy xad into linelock to
+                         * preserve correct data.
+                         */
+                        xadlock->xdlist = &xtlck->pxdlock;
+                        memcpy(xadlock->xdlist, &p->xad[XTENTRYSTART],
+                               sizeof(xad_t) * xadlock->count);
+                } else {
+                        /*
+                         * xdlist will point into the inode's xtree; ensure
+                         * that transaction is not committed lazily.
+                         */
+                        xadlock->xdlist = &p->xad[XTENTRYSTART];
+                        tblk->xflag &= ~COMMIT_LAZY;
+                }
+                jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
+                         tlck->ip, mp, xadlock->count);
+                maplock->index = 1;
+                /* mark page as invalid */
+                if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
+                    && !(tlck->type & tlckBTROOT))
+                        tlck->flag |= tlckFREEPAGE;
+                /*
+                   else (tblk->xflag & COMMIT_PMAP)
+                   ? release the page;
+                 */
+                return;
+        }
+        /*
+         *      page/entry truncation: file truncation (ref. xtTruncate())
+         *
+         *     |----------+------+------+---------------|
+         *                |      |      |
+         *                |      |     hwm - hwm before truncation
+         *                |     next - truncation point
+         *               lwm - lwm before truncation
+         * header ?
+         */
+        if (tlck->type & tlckTRUNCATE) {
+                /* truncated extent of xad; written and read only under
+                 * the same (twm == next - 1) test below
+                 */
+                pxd_t tpxd;
+                int twm;
+                /*
+                 * For truncation the entire linelock may be used, so it would
+                 * be difficult to store xad list in linelock itself.
+                 * Therefore, we'll just force transaction to be committed
+                 * synchronously, so that xtree pages won't be changed before
+                 * txUpdateMap runs.
+                 */
+                tblk->xflag &= ~COMMIT_LAZY;
+                lwm = xtlck->lwm.offset;
+                if (lwm == 0)
+                        lwm = XTPAGEMAXSLOT;
+                hwm = xtlck->hwm.offset;
+                twm = xtlck->twm.offset;
+                /*
+                 *      write log records
+                 */
+                /* log after-image for logredo():
+                 *
+                 * logredo() will update bmap for alloc of new/extended
+                 * extents (XAD_NEW|XAD_EXTENDED) of XAD[lwm:next) from
+                 * after-image of XADlist;
+                 * logredo() resets (XAD_NEW|XAD_EXTENDED) flag when
+                 * applying the after-image to the meta-data page.
+                 */
+                lrd->type = cpu_to_le16(LOG_REDOPAGE);
+                PXDaddress(pxd, mp->index);
+                PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
+                lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                /*
+                 * truncate entry XAD[twm == next - 1]:
+                 */
+                if (twm == next - 1) {
+                        /* init LOG_UPDATEMAP for logredo() to update bmap for
+                         * free of truncated delta extent of the truncated
+                         * entry XAD[next - 1]:
+                         * (xtlck->pxdlock = truncated delta extent);
+                         */
+                        pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
+                        /* assert(pxdlock->type & tlckTRUNCATE); */
+                        lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+                        lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
+                        lrd->log.updatemap.nxd = cpu_to_le16(1);
+                        lrd->log.updatemap.pxd = pxdlock->pxd;
+                        tpxd = pxdlock->pxd;    /* save to format maplock */
+                        lrd->backchain =
+                            cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+                }
+                /*
+                 * free entries XAD[next:hwm]:
+                 */
+                if (hwm >= next) {
+                        /* init LOG_UPDATEMAP of the freed extents
+                         * XAD[next:hwm] from the deleted page itself
+                         * for logredo() to update bmap;
+                         */
+                        lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+                        lrd->log.updatemap.type =
+                            cpu_to_le16(LOG_FREEXADLIST);
+                        xtlck = (struct xtlock *) & tlck->lock;
+                        hwm = xtlck->hwm.offset;
+                        lrd->log.updatemap.nxd =
+                            cpu_to_le16(hwm - next + 1);
+                        /* reformat linelock for lmLog() */
+                        xtlck->header.offset = next;
+                        xtlck->header.length = hwm - next + 1;
+                        xtlck->index = 1;
+                        lrd->backchain =
+                            cpu_to_le32(lmLog(log, tblk, lrd, tlck));
+                }
+                /*
+                 *      format maplock(s) for txUpdateMap() to update bmap
+                 *
+                 * up to three maplocks are laid out back to back in
+                 * tlck->lock; xadlock steps to the next slot after each
+                 * one and maplock->index counts how many were formatted
+                 */
+                maplock->index = 0;
+                /*
+                 * allocate entries XAD[lwm:next):
+                 */
+                if (lwm < next) {
+                        /* format a maplock for txUpdateMap() to update bPMAP
+                         * for alloc of new/extended extents of XAD[lwm:next)
+                         * from the page itself;
+                         * txUpdateMap() resets (XAD_NEW|XAD_EXTENDED) flag.
+                         */
+                        tlck->flag |= tlckUPDATEMAP;
+                        xadlock->flag = mlckALLOCXADLIST;
+                        xadlock->count = next - lwm;
+                        xadlock->xdlist = &p->xad[lwm];
+                        jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d "
+                                 "lwm:%d next:%d",
+                                 tlck->ip, mp, xadlock->count, lwm, next);
+                        maplock->index++;
+                        xadlock++;
+                }
+                /*
+                 * truncate entry XAD[twm == next - 1]:
+                 */
+                if (twm == next - 1) {
+                        /* format a maplock for txUpdateMap() to update bmap
+                         * to free truncated delta extent of the truncated
+                         * entry XAD[next - 1];
+                         * (tpxd = truncated delta extent saved above);
+                         */
+                        tlck->flag |= tlckUPDATEMAP;
+                        pxdlock = (struct pxd_lock *) xadlock;
+                        pxdlock->flag = mlckFREEPXD;
+                        pxdlock->count = 1;
+                        pxdlock->pxd = tpxd;
+                        jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d "
+                                 "hwm:%d", ip, mp, pxdlock->count, hwm);
+                        maplock->index++;
+                        xadlock++;
+                }
+                /*
+                 * free entries XAD[next:hwm]:
+                 */
+                if (hwm >= next) {
+                        /* format a maplock for txUpdateMap() to update bmap
+                         * to free extents of XAD[next:hwm] from the deleted
+                         * page itself;
+                         */
+                        tlck->flag |= tlckUPDATEMAP;
+                        xadlock->flag = mlckFREEXADLIST;
+                        xadlock->count = hwm - next + 1;
+                        xadlock->xdlist = &p->xad[next];
+                        jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d "
+                                 "next:%d hwm:%d",
+                                 tlck->ip, mp, xadlock->count, next, hwm);
+                        maplock->index++;
+                }
+                /* mark page as homeward bound */
+                tlck->flag |= tlckWRITEPAGE;
+        }
+        return;
+}
+/*
+ *      mapLog()
+ *
+ * function:    log from maplock of freed data extents;
+ *
+ * The maplock(s) in tlck->lock were formatted at txLock() time.  In both
+ * cases the tlock is flagged tlckUPDATEMAP so txUpdateMap() processes it.
+ */
+void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+            struct tlock * tlck)
+{
+        struct pxd_lock *plock = (struct pxd_lock *) & tlck->lock;
+        int left;
+        if (!(tlck->type & tlckRELOCATE)) {
+                /*
+                 * not a relocate request: one LOG_UPDATEMAP record per
+                 * maplock so logredo() updates bmap for the alloc/free
+                 * of each extent, e.g. an external EA extent or a
+                 * relocated/truncated extent from xtTailgate()
+                 */
+                lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+                left = plock->index;
+                while (left-- > 0) {
+                        if (!(plock->flag & mlckALLOCPXD))
+                                lrd->log.updatemap.type =
+                                    cpu_to_le16(LOG_FREEPXD);
+                        else
+                                lrd->log.updatemap.type =
+                                    cpu_to_le16(LOG_ALLOCPXD);
+                        lrd->log.updatemap.nxd = cpu_to_le16(1);
+                        lrd->log.updatemap.pxd = plock->pxd;
+                        lrd->backchain =
+                            cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+                        jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
+                                 (ulong) addressPXD(&plock->pxd),
+                                 lengthPXD(&plock->pxd));
+                        plock++;
+                }
+                /* update bmap */
+                tlck->flag |= tlckUPDATEMAP;
+                return;
+        }
+        /*
+         *      page relocation: free the source page extent
+         *
+         * first record: LOG_NOREDOPAGE of the old relocated page, for
+         * logredo() to start a NoRedoPage filter;
+         */
+        lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
+        lrd->log.redopage.pxd = plock->pxd;
+        lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+        /*
+         * second record: LOG_UPDATEMAP/LOG_FREEPXD for the same extent.
+         * (N.B. currently, logredo() does NOT update bmap for free of the
+         * page itself for (LOG_XTREE|LOG_NOREDOPAGE), so the free is
+         * logged explicitly here; a new LOG_RELOCATE flag could let
+         * logredo() do both at once and save this extra log write)
+         */
+        lrd->type = cpu_to_le16(LOG_UPDATEMAP);
+        lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
+        lrd->log.updatemap.nxd = cpu_to_le16(1);
+        lrd->log.updatemap.pxd = plock->pxd;
+        lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
+        /* update bmap */
+        tlck->flag |= tlckUPDATEMAP;
+}
+/*
+ *      txEA()
+ *
+ * function:    format the maplock describing a change of EA/ACL extent:
+ *              record the allocation of the new extent (or set the
+ *              COMMIT_Inlineea flag when the new EA is in-line) and
+ *              record the free of the old extent;
+ *
+ * parameter:
+ *      tid     - transaction id handed to txMaplock();
+ *      ip      - inode the maplock and commit flags belong to;
+ *      oldea   - descriptor of the EA being replaced; dereferenced
+ *                unless COMMIT_Nolink is set on ip;
+ *      newea   - descriptor of the replacement EA; may be NULL;
+ */
+void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
+{
+        struct tlock *tlck = NULL;
+        struct pxd_lock *first = NULL;  /* head entry, carries the count */
+        struct pxd_lock *slot = NULL;   /* next entry to fill in */
+        /*
+         * new EA: a completely zeroed descriptor carries neither flag
+         * and is ignored; only DXD_EXTENT needs a maplock entry
+         */
+        if (newea && (newea->flag & DXD_EXTENT)) {
+                tlck = txMaplock(tid, ip, tlckMAP);
+                first = (struct pxd_lock *) &tlck->lock;
+                first->flag = mlckALLOCPXD;
+                PXDaddress(&first->pxd, addressDXD(newea));
+                PXDlength(&first->pxd, lengthDXD(newea));
+                first->index = 1;
+                slot = first + 1;
+        } else if (newea && (newea->flag & DXD_INLINE)) {
+                set_cflag(COMMIT_Inlineea, ip);
+        }
+        /*
+         * old EA: nothing to record when the inode is flagged
+         * COMMIT_Nolink or the old EA was not an extent
+         */
+        if (test_cflag(COMMIT_Nolink, ip) || !(oldea->flag & DXD_EXTENT))
+                return;
+        /* no maplock was taken for the new EA: start an empty one */
+        if (tlck == NULL) {
+                tlck = txMaplock(tid, ip, tlckMAP);
+                first = (struct pxd_lock *) &tlck->lock;
+                first->index = 0;
+                slot = first;
+        }
+        slot->flag = mlckFREEPXD;
+        PXDaddress(&slot->pxd, addressDXD(oldea));
+        PXDlength(&slot->pxd, lengthDXD(oldea));
+        first->index++;
+}
+/*
+ *      txForce()
+ *
+ * function: synchronously write pages locked by transaction
+ *              after txLog() but before txUpdateMap();
+ *
+ * parameter:
+ *      tblk    - transaction block whose tlock list (tblk->next) is
+ *                reversed in place and then scanned;
+ *
+ * note: as implemented, no write is issued from here; each eligible
+ *      metapage is only flagged META_dirty and META_sync (see the
+ *      comment in the second loop).
+ *
+ * NOTE(review): lid_to_tlock(tblk->next) is evaluated without testing
+ *      tblk->next for zero - presumably callers only pass a transaction
+ *      that holds at least one tlock; confirm against the callers.
+ */
+void txForce(struct tblock * tblk)
+{
+        struct tlock *tlck;
+        lid_t lid, next;
+        struct metapage *mp;
+        /*
+         * reverse the order of transaction tlocks in
+         * careful update order of address index pages
+         * (right to left, bottom up)
+         *
+         * the current head is detached to become the new tail, then
+         * every remaining tlock is pushed onto the front of tblk->next
+         */
+        tlck = lid_to_tlock(tblk->next);
+        lid = tlck->next;
+        tlck->next = 0;         /* old head terminates the reversed list */
+        while (lid) {
+                tlck = lid_to_tlock(lid);
+                next = tlck->next;      /* save successor before relinking */
+                tlck->next = tblk->next;
+                tblk->next = lid;
+                lid = next;
+        }
+        /*
+         * synchronously write the page, and
+         * hold the page for txUpdateMap();
+         *
+         * tlocks without a metapage, and tlocks of type tlckBTROOT,
+         * are skipped
+         */
+        for (lid = tblk->next; lid; lid = next) {
+                tlck = lid_to_tlock(lid);
+                next = tlck->next;
+                if ((mp = tlck->mp) != NULL &&
+                    (tlck->type & tlckBTROOT) == 0) {
+                        assert(mp->xflag & COMMIT_PAGE);
+                        if (tlck->flag & tlckWRITEPAGE) {
+                                tlck->flag &= ~tlckWRITEPAGE;
+                                /* do not release page to freelist */
+                                /*
+                                 * The "right" thing to do here is to
+                                 * synchronously write the metadata.
+                                 * With the current implementation this
+                                 * is hard since write_metapage requires
+                                 * us to kunmap & remap the page.  If we
+                                 * have tlocks pointing into the metadata
+                                 * pages, we don't want to do this.  I think
+                                 * we can get by with synchronously writing
+                                 * the pages when they are released.
+                                 */
+                                assert(atomic_read(&mp->nohomeok));
+                                set_bit(META_dirty, &mp->flag);
+                                set_bit(META_sync, &mp->flag);
+                        }
+                }
+        }
+}
+/*
+ *      txUpdateMap()
+ *
+ * function:    update persistent allocation map (and working map
+ *              if appropriate);
+ *
+ * parameter:
+ *      tblk    - transaction block; its tlock list (tblk->next) is
+ *                scanned for tlocks flagged tlckUPDATEMAP, and its
+ *                xflag selects the map type (COMMIT_PMAP, else
+ *                COMMIT_PWMAP) and the COMMIT_CREATE/COMMIT_DELETE
+ *                inode map update;
+ *
+ * note: for COMMIT_DELETE the inode held in tblk->u.ip is released
+ *      with iput() once the inode map has been updated.
+ */
+static void txUpdateMap(struct tblock * tblk)
+{
+        struct inode *ip;
+        struct inode *ipimap;
+        lid_t lid;
+        struct tlock *tlck;
+        struct maplock *maplock;
+        struct pxd_lock pxdlock;
+        int maptype;
+        int k, nlock;
+        struct metapage *mp = NULL;
+        ipimap = JFS_SBI(tblk->sb)->ipimap;
+        maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
+        /*
+         *      update block allocation map
+         *
+         * update allocation state in pmap (and wmap) and
+         * update lsn of the pmap page;
+         */
+        /*
+         * scan each tlock/page of transaction for block allocation/free:
+         *
+         * for each tlock/page of transaction, update map.
+         *  ? are there tlock for pmap and pwmap at the same time ?
+         */
+        for (lid = tblk->next; lid; lid = tlck->next) {
+                tlck = lid_to_tlock(lid);
+                if ((tlck->flag & tlckUPDATEMAP) == 0)
+                        continue;
+                if (tlck->flag & tlckFREEPAGE) {
+                        /*
+                         * Another thread may attempt to reuse freed space
+                         * immediately, so we want to get rid of the metapage
+                         * before anyone else has a chance to get it.
+                         * Lock metapage, update maps, then invalidate
+                         * the metapage.
+                         */
+                        mp = tlck->mp;
+                        ASSERT(mp->xflag & COMMIT_PAGE);
+                        hold_metapage(mp, 0);
+                }
+                /*
+                 * extent list:
+                 * . in-line PXD list:
+                 * . out-of-line XAD list:
+                 *
+                 * the index of the first maplock entry gives the number
+                 * of entries to process for this tlock
+                 */
+                maplock = (struct maplock *) & tlck->lock;
+                nlock = maplock->index;
+                for (k = 0; k < nlock; k++, maplock++) {
+                        /*
+                         * allocate blocks in persistent map:
+                         *
+                         * blocks have been allocated from wmap at alloc time;
+                         */
+                        if (maplock->flag & mlckALLOC) {
+                                txAllocPMap(ipimap, maplock, tblk);
+                        }
+                        /*
+                         * free blocks in persistent and working map:
+                         * blocks will be freed in pmap and then in wmap;
+                         *
+                         * ? tblock specifies the PMAP/PWMAP based upon
+                         * transaction
+                         *
+                         * free blocks in persistent map:
+                         * blocks will be freed from wmap at last reference
+                         * release of the object for regular files;
+                         *
+                         * Always free blocks from both persistent & working
+                         * maps for directories
+                         */
+                        else {  /* (maplock->flag & mlckFREE) */
+                                if (S_ISDIR(tlck->ip->i_mode))
+                                        txFreeMap(ipimap, maplock,
+                                                  tblk, COMMIT_PWMAP);
+                                else
+                                        txFreeMap(ipimap, maplock,
+                                                  tblk, maptype);
+                        }
+                }
+                if (tlck->flag & tlckFREEPAGE) {
+                        if (!(tblk->flag & tblkGC_LAZY)) {
+                                /* This is equivalent to txRelease */
+                                ASSERT(mp->lid == lid);
+                                tlck->mp->lid = 0;
+                        }
+                        assert(atomic_read(&mp->nohomeok) == 1);
+                        atomic_dec(&mp->nohomeok);
+                        discard_metapage(mp);
+                        tlck->mp = NULL;        /* tlock no longer refers to the page */
+                }
+        }
+        /*
+         *      update inode allocation map
+         *
+         * update allocation state in pmap and
+         * update lsn of the pmap page;
+         * update in-memory inode flag/state
+         *
+         * unlock mapper/write lock
+         *
+         * NOTE(review): the third argument of diUpdatePMap() is FALSE
+         * for create and TRUE for delete - presumably an "is free"
+         * flag; confirm against diUpdatePMap().
+         */
+        if (tblk->xflag & COMMIT_CREATE) {
+                diUpdatePMap(ipimap, tblk->ino, FALSE, tblk);
+                ipimap->i_state |= I_DIRTY;
+                /* update persistent block allocation map
+                 * for the allocation of inode extent;
+                 * a one-entry pxd maplock is built on the stack for it
+                 */
+                pxdlock.flag = mlckALLOCPXD;
+                pxdlock.pxd = tblk->u.ixpxd;
+                pxdlock.index = 1;
+                txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
+        } else if (tblk->xflag & COMMIT_DELETE) {
+                ip = tblk->u.ip;
+                diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
+                ipimap->i_state |= I_DIRTY;
+                iput(ip);
+        }
+}
+/*
+ *      txAllocPMap()
+ *
+ * function: allocate from persistent map;
+ *
+ * parameter:
+ *      ip      - inode used to reach the block allocation map inode
+ *                through its superblock;
+ *      maplock - extent descriptor; its flag selects the layout:
+ *                mlckALLOCXADLIST: xad list (only entries flagged
+ *                                  XAD_NEW or XAD_EXTENDED are
+ *                                  processed, and those flags are
+ *                                  cleared afterwards);
+ *                mlckALLOCPXD:     single pxd;
+ *                otherwise:        pxd list;
+ *      tblk    - transaction block passed on to dbUpdatePMap();
+ */
+static void txAllocPMap(struct inode *ip, struct maplock * maplock,
+                        struct tblock * tblk)
+{
+        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+        s64 blkno;
+        int nblks;
+        int i;
+        if (maplock->flag & mlckALLOCXADLIST) {
+                struct xdlistlock *xadlist = (struct xdlistlock *) maplock;
+                xad_t *xadp = xadlist->xdlist;
+                for (i = 0; i < xadlist->count; i++, xadp++) {
+                        /* extents neither new nor extended are left alone */
+                        if (!(xadp->flag & (XAD_NEW | XAD_EXTENDED)))
+                                continue;
+                        blkno = addressXAD(xadp);
+                        nblks = lengthXAD(xadp);
+                        dbUpdatePMap(ipbmap, FALSE, blkno, (s64) nblks, tblk);
+                        xadp->flag &= ~(XAD_NEW | XAD_EXTENDED);
+                        jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
+                                 (ulong) blkno, nblks);
+                }
+                return;
+        }
+        if (maplock->flag & mlckALLOCPXD) {
+                pxd_t *one = &((struct pxd_lock *) maplock)->pxd;
+                blkno = addressPXD(one);
+                nblks = lengthPXD(one);
+                dbUpdatePMap(ipbmap, FALSE, blkno, (s64) nblks, tblk);
+                jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) blkno, nblks);
+                return;
+        }
+        /* remaining case: (maplock->flag & mlckALLOCPXDLIST) */
+        {
+                struct xdlistlock *pxdlist = (struct xdlistlock *) maplock;
+                pxd_t *pxdp = pxdlist->xdlist;
+                for (i = 0; i < pxdlist->count; i++, pxdp++) {
+                        blkno = addressPXD(pxdp);
+                        nblks = lengthPXD(pxdp);
+                        dbUpdatePMap(ipbmap, FALSE, blkno, (s64) nblks, tblk);
+                        jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
+                                 (ulong) blkno, nblks);
+                }
+        }
+}
+/*
+ *      txFreeMap()
+ *
+ * function:    free from persistent and/or working map;
+ *
+ * parameter:
+ *      ip      - inode used to reach the block allocation map inode
+ *                through its superblock; also passed to dbFree();
+ *      maplock - extent descriptor; its flag selects an xad list
+ *                (mlckFREEXADLIST), a single pxd (mlckFREEPXD) or,
+ *                otherwise, a pxd list;
+ *      tblk    - transaction block passed on to dbUpdatePMap();
+ *      maptype - COMMIT_PMAP (persistent map only), COMMIT_WMAP
+ *                (working map only) or COMMIT_PWMAP (persistent map
+ *                first, then working map);
+ *
+ * todo: optimization
+ */
+void txFreeMap(struct inode *ip,
+               struct maplock * maplock, struct tblock * tblk, int maptype)
+{
+        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+        struct xdlistlock *xdl;
+        xad_t *xadp;
+        pxd_t *pxdp;
+        s64 blkno;
+        int nblks;
+        int i;
+        jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
+                 tblk, maplock, maptype);
+        /*
+         * pass 1: persistent map
+         */
+        if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
+                if (maplock->flag & mlckFREEXADLIST) {
+                        xdl = (struct xdlistlock *) maplock;
+                        xadp = xdl->xdlist;
+                        for (i = 0; i < xdl->count; i++, xadp++) {
+                                /* XAD_NEW entries are skipped in this pass */
+                                if (xadp->flag & XAD_NEW)
+                                        continue;
+                                blkno = addressXAD(xadp);
+                                nblks = lengthXAD(xadp);
+                                dbUpdatePMap(ipbmap, TRUE, blkno,
+                                             (s64) nblks, tblk);
+                                jfs_info("freePMap: xaddr:0x%lx xlen:%d",
+                                         (ulong) blkno, nblks);
+                        }
+                } else if (maplock->flag & mlckFREEPXD) {
+                        pxdp = &((struct pxd_lock *) maplock)->pxd;
+                        blkno = addressPXD(pxdp);
+                        nblks = lengthPXD(pxdp);
+                        dbUpdatePMap(ipbmap, TRUE, blkno, (s64) nblks, tblk);
+                        jfs_info("freePMap: xaddr:0x%lx xlen:%d",
+                                 (ulong) blkno, nblks);
+                } else {        /* (maplock->flag & mlckFREEPXDLIST) */
+                        xdl = (struct xdlistlock *) maplock;
+                        pxdp = xdl->xdlist;
+                        for (i = 0; i < xdl->count; i++, pxdp++) {
+                                blkno = addressPXD(pxdp);
+                                nblks = lengthPXD(pxdp);
+                                dbUpdatePMap(ipbmap, TRUE, blkno,
+                                             (s64) nblks, tblk);
+                                jfs_info("freePMap: xaddr:0x%lx xlen:%d",
+                                         (ulong) blkno, nblks);
+                        }
+                }
+        }
+        /*
+         * pass 2: working map
+         */
+        if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
+                if (maplock->flag & mlckFREEXADLIST) {
+                        xdl = (struct xdlistlock *) maplock;
+                        xadp = xdl->xdlist;
+                        for (i = 0; i < xdl->count; i++, xadp++) {
+                                blkno = addressXAD(xadp);
+                                nblks = lengthXAD(xadp);
+                                dbFree(ip, blkno, (s64) nblks);
+                                /* all xad flags are cleared once freed */
+                                xadp->flag = 0;
+                                jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
+                                         (ulong) blkno, nblks);
+                        }
+                } else if (maplock->flag & mlckFREEPXD) {
+                        pxdp = &((struct pxd_lock *) maplock)->pxd;
+                        blkno = addressPXD(pxdp);
+                        nblks = lengthPXD(pxdp);
+                        dbFree(ip, blkno, (s64) nblks);
+                        jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
+                                 (ulong) blkno, nblks);
+                } else {        /* (maplock->flag & mlckFREEPXDLIST) */
+                        xdl = (struct xdlistlock *) maplock;
+                        pxdp = xdl->xdlist;
+                        for (i = 0; i < xdl->count; i++, pxdp++) {
+                                blkno = addressPXD(pxdp);
+                                nblks = lengthPXD(pxdp);
+                                dbFree(ip, blkno, (s64) nblks);
+                                jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
+                                         (ulong) blkno, nblks);
+                        }
+                }
+        }
+}
+/*
+ *      txFreelock()
+ *
+ * function:    remove tlock from inode anonymous locklist
+ *
+ * parameter:
+ *      ip      - inode whose anonymous tlock list (atlhead/atltail in
+ *                its jfs_inode_info) is scanned;
+ *
+ * Every tlock on the list flagged tlckFREELOCK is unlinked and handed
+ * to txLockFree(); the others stay in place.  atltail is then set to
+ * the last surviving lid, or to 0 when the list became empty, in which
+ * case the inode is also removed from its anon_inode_list.
+ *
+ * note: the first test of atlhead is made before TXN_LOCK() is taken.
+ */
+void txFreelock(struct inode *ip)
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+        struct tlock *xtlck, *tlck;
+        lid_t xlid = 0, lid;
+        if (!jfs_ip->atlhead)
+                return;
+        TXN_LOCK();
+        xtlck = (struct tlock *) &jfs_ip->atlhead;     /* NOTE(review): treats atlhead as a tlock's 'next' - presumably 'next' is the first member of struct tlock; confirm */
+        while ((lid = xtlck->next) != 0) {
+                tlck = lid_to_tlock(lid);
+                if (tlck->flag & tlckFREELOCK) {
+                        xtlck->next = tlck->next;       /* unlink; predecessor is unchanged */
+                        txLockFree(lid);
+                } else {
+                        xtlck = tlck;
+                        xlid = lid;     /* remember last tlock kept on the list */
+                }
+        }
+        if (jfs_ip->atlhead)
+                jfs_ip->atltail = xlid;
+        else {
+                jfs_ip->atltail = 0;
+                /*
+                 * If inode was on anon_list, remove it
+                 */
+                list_del_init(&jfs_ip->anon_inode_list);
+        }
+        TXN_UNLOCK();
+}
+/*
+ *      txAbort()
+ *
+ * function: abort tx before commit;
+ *
+ * Frees every tlock held by the transaction.  For each tlock the
+ * owning inode's xtlid is cleared and, when a metapage is attached,
+ * the page's lid is cleared; if that page is a COMMIT_PAGE page with
+ * a non-zero lsn, LogSyncRelease() is called on it (to avoid logwrap).
+ *
+ * parameter:
+ *      tid     - id of the transaction to abort;
+ *      dirty   - when non-zero, jfs_error() is called on the
+ *                transaction's superblock after the tlocks are freed
+ *                (described historically as marking the file system
+ *                FM_DIRTY in the superblock);
+ *
+ * note: the transaction block itself is left for the caller to free.
+ */
+void txAbort(tid_t tid, int dirty)
+{
+        struct tblock *tblk = tid_to_tblock(tid);
+        lid_t cur = tblk->next;
+        /*
+         * release the tlocks one by one
+         */
+        while (cur) {
+                struct tlock *tlck = lid_to_tlock(cur);
+                lid_t following = tlck->next;   /* read before tlock is freed */
+                struct metapage *mp = tlck->mp;
+                JFS_IP(tlck->ip)->xtlid = 0;
+                if (mp) {
+                        mp->lid = 0;
+                        /*
+                         * reset lsn of page to avoid logwrap:
+                         *
+                         * (the page may have been committed earlier by
+                         * other transaction(s) without having been paged
+                         * out yet, i.e. it may sit on the logsync list
+                         * although it was never logged for this tx.)
+                         */
+                        if ((mp->xflag & COMMIT_PAGE) && mp->lsn)
+                                LogSyncRelease(mp);
+                }
+                /* insert tlock at head of freelist */
+                TXN_LOCK();
+                txLockFree(cur);
+                TXN_UNLOCK();
+                cur = following;
+        }
+        /* caller will free the transaction block */
+        tblk->next = tblk->last = 0;
+        /*
+         * mark filesystem dirty
+         */
+        if (dirty)
+                jfs_error(tblk->sb, "txAbort");
+}
+/*
+ *      txLazyCommit(void)
+ *
+ *      All transactions except those changing ipimap (COMMIT_FORCE) are
+ *      processed by this routine.  This insures that the inode and block
+ *      allocation maps are updated in order.  For synchronous transactions,
+ *      let the user thread finish processing after txUpdateMap() is called.
+ *
+ * parameter:
+ *      tblk    - transaction block to finish; it must already carry
+ *                tblkGC_READY or tblkGC_UNLOCKED, otherwise this routine
+ *                yields until one of them is set;
+ *
+ *      After txUpdateMap() the block is flagged tblkGC_COMMITTED and all
+ *      waiters on tblk->gcwait are woken, both under log->gclock.  For a
+ *      tblkGC_LAZY transaction this routine also calls txUnlock() and
+ *      txEnd() itself.
+ */
+static void txLazyCommit(struct tblock * tblk)
+{
+        struct jfs_log *log;
+        while (((tblk->flag & tblkGC_READY) == 0) &&
+               ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
+                /* We must have gotten ahead of the user thread
+                 */
+                jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
+                yield();
+        }
+        jfs_info("txLazyCommit: processing tblk 0x%p", tblk);
+        txUpdateMap(tblk);
+        log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
+        spin_lock_irq(&log->gclock);    // LOGGC_LOCK
+        tblk->flag |= tblkGC_COMMITTED;
+        if (tblk->flag & tblkGC_READY)
+                log->gcrtc--;
+        wake_up_all(&tblk->gcwait);     // LOGGC_WAKEUP
+        /*
+         * Can't release log->gclock until we've tested tblk->flag
+         */
+        if (tblk->flag & tblkGC_LAZY) {
+                spin_unlock_irq(&log->gclock);  // LOGGC_UNLOCK
+                txUnlock(tblk);
+                tblk->flag &= ~tblkGC_LAZY;
+                txEnd(tblk - TxBlock);  /* Convert back to tid */
+        } else
+                spin_unlock_irq(&log->gclock);  // LOGGC_UNLOCK
+        jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
+}
+/*
+ *      jfs_lazycommit(void)
+ *
+ *      To be run as a kernel daemon.  If lbmIODone is called in an interrupt
+ *      context, or where blocking is not wanted, this routine will process
+ *      committed transactions from the unlock queue.
+ *
+ * parameter:
+ *      arg     - unused;
+ *
+ *      The thread loops until jfs_stop_threads is set.  On each pass it
+ *      drains TxAnchor.unlock_queue, skipping any transaction whose
+ *      superblock is already being serviced (IN_LAZYCOMMIT), then sleeps
+ *      on jfs_commit_thread_wait.  It does not return: it leaves through
+ *      complete_and_exit().
+ */
+int jfs_lazycommit(void *arg)
+{
+        int WorkDone;
+        struct tblock *tblk;
+        unsigned long flags;
+        struct jfs_sb_info *sbi;
+        daemonize("jfsCommit");
+        complete(&jfsIOwait);
+        do {
+                LAZY_LOCK(flags);
+                jfs_commit_thread_waking = 0;   /* OK to wake another thread */
+                while (!list_empty(&TxAnchor.unlock_queue)) {
+                        WorkDone = 0;
+                        list_for_each_entry(tblk, &TxAnchor.unlock_queue,
+                                            cqueue) {
+                                sbi = JFS_SBI(tblk->sb);
+                                /*
+                                 * For each volume, the transactions must be
+                                 * handled in order.  If another commit thread
+                                 * is handling a tblk for this superblock,
+                                 * skip it
+                                 */
+                                if (sbi->commit_state & IN_LAZYCOMMIT)
+                                        continue;
+                                sbi->commit_state |= IN_LAZYCOMMIT;
+                                WorkDone = 1;
+                                /*
+                                 * Remove transaction from queue
+                                 */
+                                list_del(&tblk->cqueue);
+                                /* the commit itself runs without the lazy lock */
+                                LAZY_UNLOCK(flags);
+                                txLazyCommit(tblk);
+                                LAZY_LOCK(flags);
+                                sbi->commit_state &= ~IN_LAZYCOMMIT;
+                                /*
+                                 * Don't continue in the for loop.  (We can't
+                                 * anyway, it's unsafe!)  We want to go back to
+                                 * the beginning of the list.
+                                 */
+                                break;
+                        }
+                        /* If there was nothing to do, don't continue */
+                        if (!WorkDone)
+                                break;
+                }
+                /* In case a wakeup came while all threads were active */
+                jfs_commit_thread_waking = 0;
+                if (current->flags & PF_FREEZE) {
+                        LAZY_UNLOCK(flags);
+                        refrigerator(PF_FREEZE);
+                } else {
+                        DECLARE_WAITQUEUE(wq, current);
+                        /*
+                         * queue ourselves and change state before the lock
+                         * is dropped, so a wakeup issued under the lock
+                         * cannot be missed
+                         */
+                        add_wait_queue(&jfs_commit_thread_wait, &wq);
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        LAZY_UNLOCK(flags);
+                        schedule();
+                        /* use the task-state accessor, not a raw store */
+                        __set_current_state(TASK_RUNNING);
+                        remove_wait_queue(&jfs_commit_thread_wait, &wq);
+                }
+        } while (!jfs_stop_threads);
+        if (!list_empty(&TxAnchor.unlock_queue))
+                jfs_err("jfs_lazycommit being killed w/pending transactions!");
+        else
+                jfs_info("jfs_lazycommit being killed");
+        complete_and_exit(&jfsIOwait, 0);
+}
+/*
+ *      txLazyUnlock()
+ *
+ * function:    append a transaction block to TxAnchor.unlock_queue and
+ *              wake a commit thread for it when that is useful;
+ *
+ * parameter:
+ *      tblk    - transaction block to queue (linked through cqueue);
+ */
+void txLazyUnlock(struct tblock * tblk)
+{
+        unsigned long flags;
+        int skip_wakeup;
+        LAZY_LOCK(flags);
+        list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
+        /*
+         * No wakeup is needed when a commit thread is already servicing
+         * this superblock, or when the thread woken last has not started
+         * running yet.
+         */
+        skip_wakeup = (JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) ||
+            jfs_commit_thread_waking;
+        if (!skip_wakeup) {
+                jfs_commit_thread_waking = 1;
+                wake_up(&jfs_commit_thread_wait);
+        }
+        LAZY_UNLOCK(flags);
+}
+/*
+ *      LogSyncRelease()
+ *
+ * function:    drop one nohomeok reference on a metapage and, when it
+ *              was the last one, detach the page from its log's sync
+ *              list;
+ *
+ * parameter:
+ *      mp      - metapage; must have a non-zero nohomeok count and a
+ *                non-NULL log (both are asserted);
+ */
+static void LogSyncRelease(struct metapage * mp)
+{
+        struct jfs_log *log = mp->log;
+        assert(atomic_read(&mp->nohomeok));
+        assert(log);
+        /*
+         * Decrement and test in a single atomic step.  A separate
+         * atomic_dec() followed by atomic_read() lets two concurrent
+         * callers both observe zero and both run the teardown below.
+         */
+        if (!atomic_dec_and_test(&mp->nohomeok))
+                return;
+        hold_metapage(mp, 0);
+        LOGSYNC_LOCK(log);
+        mp->log = NULL;
+        mp->lsn = 0;
+        mp->clsn = 0;
+        log->count--;
+        list_del_init(&mp->synclist);
+        LOGSYNC_UNLOCK(log);
+        release_metapage(mp);
+}
+/*
+ *      txQuiesce
+ *
+ *      Block all new transactions and push anonymous transactions to
+ *      completion
+ *
+ *      This does almost the same thing as jfs_sync below.  We don't
+ *      worry about deadlocking when jfs_tlocks_low is set, since we would
+ *      expect jfs_sync to get us out of that jam.
+ *
+ * parameter:
+ *      sb      - superblock whose log is flagged log_QUIESCE and then
+ *                flushed with jfs_flush_journal();
+ *
+ *      note: the anonymous inode lists in TxAnchor are walked as a
+ *      whole; each inode found there is committed with
+ *      COMMIT_INODE | COMMIT_FORCE.
+ */
+void txQuiesce(struct super_block *sb)
+{
+        struct inode *ip;
+        struct jfs_inode_info *jfs_ip;
+        struct jfs_log *log = JFS_SBI(sb)->log;
+        tid_t tid;
+        set_bit(log_QUIESCE, &log->flag);
+        TXN_LOCK();
+restart:
+        while (!list_empty(&TxAnchor.anon_list)) {
+                jfs_ip = list_entry(TxAnchor.anon_list.next,
+                                    struct jfs_inode_info,
+                                    anon_inode_list);
+                ip = &jfs_ip->vfs_inode;
+                /*
+                 * inode will be removed from anonymous list
+                 * when it is committed
+                 */
+                TXN_UNLOCK();
+                tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
+                down(&jfs_ip->commit_sem);
+                txCommit(tid, 1, &ip, 0);
+                txEnd(tid);
+                up(&jfs_ip->commit_sem);
+                /*
+                 * Just to be safe.  I don't know how
+                 * long we can run without blocking
+                 */
+                cond_resched();
+                TXN_LOCK();
+        }
+        /*
+         * If jfs_sync is running in parallel, there could be some inodes
+         * on anon_list2.  Let's check.
+         */
+        if (!list_empty(&TxAnchor.anon_list2)) {
+                /* splice and re-initialise in one call, as jfs_sync() does */
+                list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
+                goto restart;
+        }
+        TXN_UNLOCK();
+        /*
+         * We may need to kick off the group commit
+         */
+        jfs_flush_journal(log, 0);
+}
+/*
+ *      txResume()
+ *
+ * function:    let transactions start again after txQuiesce();
+ *
+ * parameter:
+ *      sb      - superblock whose log has its log_QUIESCE bit cleared;
+ *                waiters on that log's syncwait are then woken;
+ */
+void txResume(struct super_block *sb)
+{
+        struct jfs_log *jlog = JFS_SBI(sb)->log;
+        /* clear the flag first, then wake the waiters */
+        clear_bit(log_QUIESCE, &jlog->flag);
+        TXN_WAKEUP(&jlog->syncwait);
+}
+/*
+ *      jfs_sync(void)
+ *
+ *      To be run as a kernel daemon.  This is awakened when tlocks run low.
+ *      We write any inodes that have anonymous tlocks so they will become
+ *      available.
+ *
+ * parameter:
+ *      arg     - unused;
+ *
+ *      The thread loops until jfs_stop_threads is set, sleeping on
+ *      jfs_sync_thread_wait between passes.  It does not return: it
+ *      leaves through complete_and_exit().
+ */
+int jfs_sync(void *arg)
+{
+        struct inode *ip;
+        struct jfs_inode_info *jfs_ip;
+        tid_t tid;
+        daemonize("jfsSync");
+        complete(&jfsIOwait);
+        do {
+                /*
+                 * write each inode on the anonymous inode list
+                 */
+                TXN_LOCK();
+                while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
+                        jfs_ip = list_entry(TxAnchor.anon_list.next,
+                                            struct jfs_inode_info,
+                                            anon_inode_list);
+                        ip = &jfs_ip->vfs_inode;
+                        if (! igrab(ip)) {
+                                /*
+                                 * Inode is being freed
+                                 */
+                                list_del_init(&jfs_ip->anon_inode_list);
+                        } else if (! down_trylock(&jfs_ip->commit_sem)) {
+                                /*
+                                 * inode will be removed from anonymous list
+                                 * when it is committed
+                                 */
+                                TXN_UNLOCK();
+                                tid = txBegin(ip->i_sb, COMMIT_INODE);
+                                /*
+                                 * the result of txCommit() is not acted
+                                 * upon here, exactly as before
+                                 */
+                                txCommit(tid, 1, &ip, 0);
+                                txEnd(tid);
+                                up(&jfs_ip->commit_sem);
+                                iput(ip);
+                                /*
+                                 * Just to be safe.  I don't know how
+                                 * long we can run without blocking
+                                 */
+                                cond_resched();
+                                TXN_LOCK();
+                        } else {
+                                /* We can't get the commit semaphore.  It may
+                                 * be held by a thread waiting for tlock's
+                                 * so let's not block here.  Save it to
+                                 * put back on the anon_list.
+                                 */
+                                /* Move from anon_list to anon_list2 */
+                                list_move(&jfs_ip->anon_inode_list,
+                                          &TxAnchor.anon_list2);
+                                TXN_UNLOCK();
+                                iput(ip);
+                                TXN_LOCK();
+                        }
+                }
+                /* Add anon_list2 back to anon_list */
+                list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
+                if (current->flags & PF_FREEZE) {
+                        TXN_UNLOCK();
+                        refrigerator(PF_FREEZE);
+                } else {
+                        DECLARE_WAITQUEUE(wq, current);
+                        /*
+                         * queue ourselves and change state before the lock
+                         * is dropped, so a wakeup issued under the lock
+                         * cannot be missed
+                         */
+                        add_wait_queue(&jfs_sync_thread_wait, &wq);
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        TXN_UNLOCK();
+                        schedule();
+                        /* use the task-state accessor, not a raw store */
+                        __set_current_state(TASK_RUNNING);
+                        remove_wait_queue(&jfs_sync_thread_wait, &wq);
+                }
+        } while (!jfs_stop_threads);
+        jfs_info("jfs_sync being killed");
+        complete_and_exit(&jfsIOwait, 0);
+}
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
+/*
+ * procfs read routine (built only with CONFIG_PROC_FS and CONFIG_JFS_DEBUG).
+ *
+ * Formats a snapshot of the global TxAnchor fields and jfs_tlocks_low into
+ * buffer, points *start at the requested offset inside that text and
+ * returns how many bytes are available from there, capped at length.
+ * *eof is set when the remainder fits into length.  data is unused.
+ */
+int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
+                      int *eof, void *data)
+{
+        /* each wait queue is reported as "active" or "empty" */
+        char *freetid_state =
+            waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
+        char *freelock_state =
+            waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
+        char *lowlock_state =
+            waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
+        int avail;
+        avail = sprintf(buffer,
+                        "JFS TxAnchor\n"
+                        "============\n"
+                        "freetid = %d\n"
+                        "freewait = %s\n"
+                        "freelock = %d\n"
+                        "freelockwait = %s\n"
+                        "lowlockwait = %s\n"
+                        "tlocksInUse = %d\n"
+                        "jfs_tlocks_low = %d\n"
+                        "unlock_queue is %sempty\n",
+                        TxAnchor.freetid,
+                        freetid_state,
+                        TxAnchor.freelock,
+                        freelock_state,
+                        lowlock_state,
+                        TxAnchor.tlocksInUse,
+                        jfs_tlocks_low,
+                        list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+        /* drop the part of the report that lies before offset */
+        *start = buffer + offset;
+        avail -= offset;
+        if (avail <= length)
+                *eof = 1;
+        else
+                avail = length;
+        /* an offset past the end of the text yields an empty read */
+        return (avail < 0) ? 0 : avail;
+}
+#endif
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
+/*
+ * procfs read routine (built only with CONFIG_PROC_FS and
+ * CONFIG_JFS_STATISTICS).
+ *
+ * Formats the counters kept in the global TxStat into buffer, points
+ * *start at the requested offset inside that text and returns how many
+ * bytes are available from there, capped at length.  *eof is set when
+ * the remainder fits into length.  data is unused.
+ */
+int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
+                     int *eof, void *data)
+{
+        int avail;
+        avail = sprintf(buffer,
+                        "JFS TxStats\n"
+                        "===========\n"
+                        "calls to txBegin = %d\n"
+                        "txBegin blocked by sync barrier = %d\n"
+                        "txBegin blocked by tlocks low = %d\n"
+                        "txBegin blocked by no free tid = %d\n"
+                        "calls to txBeginAnon = %d\n"
+                        "txBeginAnon blocked by sync barrier = %d\n"
+                        "txBeginAnon blocked by tlocks low = %d\n"
+                        "calls to txLockAlloc = %d\n"
+                        "tLockAlloc blocked by no free lock = %d\n",
+                        TxStat.txBegin,
+                        TxStat.txBegin_barrier,
+                        TxStat.txBegin_lockslow,
+                        TxStat.txBegin_freetid,
+                        TxStat.txBeginAnon,
+                        TxStat.txBeginAnon_barrier,
+                        TxStat.txBeginAnon_lockslow,
+                        TxStat.txLockAlloc,
+                        TxStat.txLockAlloc_freelock);
+        /* drop the part of the report that lies before offset */
+        *start = buffer + offset;
+        avail -= offset;
+        if (avail <= length)
+                *eof = 1;
+        else
+                avail = length;
+        /* an offset past the end of the text yields an empty read */
+        return (avail < 0) ? 0 : avail;
+}
+#endif
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
new file mode 100644
index 000000000000..b71b82c2df04
--- /dev/null
+++ b/fs/jfs/jfs_txnmgr.h
@@ -0,0 +1,318 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_TXNMGR
+#define _H_JFS_TXNMGR
+#include "jfs_logmgr.h"
+/*
+ * Hide implementation of TxBlock and TxLock
+ *
+ * A tid (resp. lid) is used directly as an index into the global
+ * TxBlock (resp. TxLock) table; each macro evaluates to a pointer to
+ * the indexed entry.  No range check is performed here.
+ */
+#define tid_to_tblock(tid) (&TxBlock[tid])
+#define lid_to_tlock(lid) (&TxLock[lid])
+/*
+ *      transaction block
+ *
+ * One entry of the global TxBlock table; tid_to_tblock() maps a
+ * transaction id to its entry by plain array indexing.
+ */
+struct tblock {
+        /*
+         * tblock and jbuf_t common area: struct logsyncblk
+         *
+         * the following 5 fields are the same as struct logsyncblk
+         * which is common to tblock and jbuf to form logsynclist
+         * (xflag, flag, dummy, lsn, synclist: do not reorder or resize)
+         */
+        u16 xflag;              /* tx commit type (COMMIT_* flags) */
+        u16 flag;               /* tx commit state (see jfs_logmgr.h) */
+        lid_t dummy;            /* Must keep structures common */
+        s32 lsn;                /* recovery lsn */
+        struct list_head synclist;      /* logsynclist link */
+        /* lock management */
+        struct super_block *sb; /* super block */
+        lid_t next;             /* index of first tlock of tid */
+        lid_t last;             /* index of last tlock of tid */
+        wait_queue_head_t waitor;       /* tids waiting on this tid */
+        /* log management */
+        u32 logtid;             /* log transaction id */
+        /* commit management */
+        struct list_head cqueue;        /* commit queue list */
+        s32 clsn;               /* commit lsn */
+        struct lbuf *bp;        /* NOTE(review): undocumented; presumably
+                                 * the log buffer of the commit record
+                                 * described by pn/eor -- confirm
+                                 */
+        s32 pn;                 /* commit record log page number */
+        s32 eor;                /* commit record eor */
+        wait_queue_head_t gcwait;       /* group commit event list:
+                                         * ready transactions wait on this
+                                         * event for group commit completion.
+                                         */
+        union {
+                struct inode *ip; /* inode being deleted */
+                pxd_t ixpxd;    /* pxd of inode extent for created inode */
+        } u;                    /* the two members share storage */
+        u32 ino;                /* inode number being created */
+};
+extern struct tblock *TxBlock;  /* transaction block table */
+/* commit flags: tblk->xflag (a u16, so only the low 16 bits are usable) */
+#define COMMIT_SYNC     0x0001  /* synchronous commit */
+#define COMMIT_FORCE    0x0002  /* force pageout at end of commit */
+#define COMMIT_FLUSH    0x0004  /* init flush at end of commit */
+#define COMMIT_MAP      0x00f0  /* mask covering the three map flags below */
+#define COMMIT_PMAP     0x0010  /* update pmap */
+#define COMMIT_WMAP     0x0020  /* update wmap */
+#define COMMIT_PWMAP    0x0040  /* update pwmap */
+#define COMMIT_FREE     0x0f00  /* mask covering bits 0x0100 through 0x0800 */
+#define COMMIT_DELETE   0x0100  /* inode delete */
+#define COMMIT_TRUNCATE 0x0200  /* file truncation */
+#define COMMIT_CREATE   0x0400  /* inode create */
+#define COMMIT_LAZY     0x0800  /* lazy commit */
+#define COMMIT_PAGE     0x1000  /* Identifies element as metapage */
+#define COMMIT_INODE    0x2000  /* Identifies element as inode */
+/* group commit flags tblk->flag: see jfs_logmgr.h */
+/*
+ *      transaction lock
+ *
+ * One entry of the global TxLock table; lid_to_tlock() maps a lock id
+ * to its entry by plain array indexing.  The number that starts each
+ * field comment is the field size in bytes, a parenthesized number is
+ * a running total.
+ */
+struct tlock {
+        lid_t next;             /* 2: index next lockword on tid locklist
+                                 *          next lockword on freelist
+                                 */
+        tid_t tid;              /* 2: transaction id holding lock */
+        u16 flag;               /* 2: lock control (tlock flag bits) */
+        u16 type;               /* 2: log type (tlock type bits) */
+        struct metapage *mp;    /* 4/8: object page buffer locked */
+        struct inode *ip;       /* 4/8: object */
+        /* (16) with 4-byte pointers, (24) with 8-byte pointers */
+        s16 lock[24];           /* 48: overlay area; the linelock and
+                                 * maplock variants are laid over it
+                                 */
+};                              /* (64) with 4-byte pointers */
+extern struct tlock *TxLock;    /* transaction lock table */
+/*
+ * tlock flag
+ *
+ * Bit values for tlock.flag (a u16).
+ */
+/* txLock state */
+#define tlckPAGELOCK            0x8000
+#define tlckINODELOCK           0x4000
+#define tlckLINELOCK            0x2000
+#define tlckINLINELOCK          0x1000
+/* lmLog state */
+#define tlckLOG                 0x0800
+/* updateMap state */
+#define tlckUPDATEMAP           0x0080
+/* freeLock state */
+#define tlckFREELOCK            0x0008
+#define tlckWRITEPAGE           0x0004
+#define tlckFREEPAGE            0x0002
+/*
+ * tlock type
+ *
+ * Bit values for tlock.type (a u16).  tlckTYPE masks the seven bits
+ * 0x0200 through 0x8000, tlckOPERATION masks the low byte; tlckBTROOT
+ * (0x0100) lies outside both masks.  Some operation values are shared
+ * by two names (tlckGROW/tlckENTRY, tlckREMOVE/tlckEXTEND).
+ * NOTE(review): presumably the shared values are told apart by the
+ * type bits they are combined with -- confirm against the users.
+ */
+#define tlckTYPE                0xfe00
+#define tlckINODE               0x8000
+#define tlckXTREE               0x4000
+#define tlckDTREE               0x2000
+#define tlckMAP                 0x1000
+#define tlckEA                  0x0800
+#define tlckACL                 0x0400
+#define tlckDATA                0x0200
+#define tlckBTROOT              0x0100
+#define tlckOPERATION           0x00ff
+#define tlckGROW                0x0001  /* file grow */
+#define tlckREMOVE              0x0002  /* file delete */
+#define tlckTRUNCATE            0x0004  /* file truncate */
+#define tlckRELOCATE            0x0008  /* file/directory relocate */
+#define tlckENTRY               0x0001  /* directory insert/delete */
+#define tlckEXTEND              0x0002  /* directory extend in-line */
+#define tlckSPLIT               0x0010  /* split page */
+#define tlckNEW                 0x0020  /* new page from split */
+#define tlckFREE                0x0040  /* free page */
+#define tlckRELINK              0x0080  /* update sibling pointer */
+/*
+ *      linelock for lmLog()
+ *
+ * note: linelock and its variations are overlaid
+ * at tlock.lock: watch for alignment;
+ *
+ * struct linelock is 8 bytes of header followed by 20 two-byte lv
+ * entries, i.e. 48 bytes, the same size as tlock.lock (s16[24]).
+ * The number that starts each field comment is the field size in bytes.
+ */
+struct lv {
+        u8 offset;              /* 1: */
+        u8 length;              /* 1: */
+};                              /* (2) */
+#define TLOCKSHORT      20      /* equals the lv[] dimension of linelock */
+#define TLOCKLONG       28      /* NOTE(review): not used in this header */
+struct linelock {
+        lid_t next;             /* 2: next linelock */
+        s8 maxcnt;              /* 1: */
+        s8 index;               /* 1: */
+        u16 flag;               /* 2: */
+        u8 type;                /* 1: */
+        u8 l2linesize;          /* 1: log2 of linesize */
+        /* (8) */
+        struct lv lv[20];       /* 40: */
+};                              /* (48) */
+#define dt_lock linelock        /* struct dt_lock is struct linelock */
+struct xtlock {                 /* variant with the same 8-byte header as
+                                 * struct linelock and the same 48-byte
+                                 * total; sizes below are in bytes
+                                 */
+        lid_t next;             /* 2: */
+        s8 maxcnt;              /* 1: */
+        s8 index;               /* 1: */
+        u16 flag;               /* 2: */
+        u8 type;                /* 1: */
+        u8 l2linesize;          /* 1: log2 of linesize */
+                                /* (8) */
+        struct lv header;       /* 2: */
+        struct lv lwm;          /* 2: low water mark */
+        struct lv hwm;          /* 2: high water mark */
+        struct lv twm;          /* 2: */
+                                /* (16) */
+        s32 pxdlock[8];         /* 32: */
+};                              /* (48) */
+/*
+ *      maplock for txUpdateMap()
+ *
+ * note: maplock and its variations are overlaid
+ * at tlock.lock/linelock: watch for alignment;
+ * N.B. next field may be set by linelock, and should not
+ * be modified by maplock;
+ * N.B. index of the first pxdlock specifies index of next
+ * free maplock (i.e., number of maplock) in the tlock;
+ *
+ * The number that starts each field comment is the field size in bytes.
+ */
+struct maplock {
+        lid_t next;             /* 2: */
+        u8 maxcnt;              /* 1: */
+        u8 index;               /* 1: next free maplock index */
+        u16 flag;               /* 2: */
+        u8 type;                /* 1: */
+        u8 count;               /* 1: number of pxd/xad */
+                                /* (8) */
+        pxd_t pxd;              /* 8: */
+};                              /* (16): */
+/* maplock flag */
+#define mlckALLOC               0x00f0  /* mask of the four ALLOC flags */
+#define mlckALLOCXADLIST        0x0080
+#define mlckALLOCPXDLIST        0x0040
+#define mlckALLOCXAD            0x0020
+#define mlckALLOCPXD            0x0010
+#define mlckFREE                0x000f  /* mask of the four FREE flags */
+#define mlckFREEXADLIST         0x0008
+#define mlckFREEPXDLIST         0x0004
+#define mlckFREEXAD             0x0002
+#define mlckFREEPXD             0x0001
+#define pxd_lock        maplock /* struct pxd_lock is struct maplock */
+struct xdlistlock {             /* same 8-byte header as struct maplock,
+                                 * followed by a list pointer instead
+                                 * of an embedded pxd; sizes in bytes
+                                 */
+        lid_t next;             /* 2: */
+        u8 maxcnt;              /* 1: */
+        u8 index;               /* 1: */
+        u16 flag;               /* 2: */
+        u8 type;                /* 1: */
+        u8 count;               /* 1: number of pxd/xad */
+                                /* (8) */
+        /*
+         * We need xdlist to be 64 bits (8 bytes), regardless of
+         * whether void * is 32 or 64 bits
+         */
+        union {
+                void *_xdlist;  /* pxd/xad list */
+                s64 pad;        /* 8: Force 64-bit xdlist size */
+        } union64;
+};                              /* (16): */
+#define xdlist union64._xdlist  /* lets p->xdlist name p->union64._xdlist */
+/*
+ *      commit
+ *
+ * parameter to the commit manager routines
+ *
+ * Bundles the transaction id, commit flags, log, super block, the list
+ * of inodes taking part in the commit and one log record descriptor.
+ */
+struct commit {
+        tid_t tid;              /* tid = index of tblock */
+        int flag;               /* flags */
+        struct jfs_log *log;    /* log */
+        struct super_block *sb; /* superblock */
+        int nip;                /* number of entries in iplist */
+        struct inode **iplist;  /* list of pointers to inodes */
+        /* log record descriptor on 64-bit boundary
+         * NOTE(review): no alignment attribute is used here; the
+         * boundary depends on the layout of the fields above -- confirm
+         */
+        struct lrd lrd;         /* : log record descriptor */
+};
+/*
+ * external declarations
+ *
+ * Prototypes only; none of these routines is defined in this header.
+ * tid arguments are transaction ids as returned by txBegin().
+ * NOTE(review): the parameter of txLinelock() is named tlock although
+ * its type is struct linelock * -- confirm which object callers pass.
+ */
+extern struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage *mp,
+                            int flag);
+extern struct tlock *txMaplock(tid_t tid, struct inode *ip, int flag);
+extern int txCommit(tid_t tid, int nip, struct inode **iplist, int flag);
+extern tid_t txBegin(struct super_block *sb, int flag);
+extern void txBeginAnon(struct super_block *sb);
+extern void txEnd(tid_t tid);
+extern void txAbort(tid_t tid, int dirty);
+extern struct linelock *txLinelock(struct linelock * tlock);
+extern void txFreeMap(struct inode *ip, struct maplock * maplock,
+                      struct tblock * tblk, int maptype);
+extern void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea);
+extern void txFreelock(struct inode *ip);
+extern int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+                 struct tlock * tlck);
+extern void txQuiesce(struct super_block *sb);
+extern void txResume(struct super_block *sb);
+#endif                          /* _H_JFS_TXNMGR */
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
new file mode 100644
index 000000000000..5bfad39a2078
--- /dev/null
+++ b/fs/jfs/jfs_types.h
@@ -0,0 +1,192 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_TYPES
+#define _H_JFS_TYPES
+/*
+ *      jfs_types.h:
+ *
+ * basic type/utility  definitions
+ *
+ * note: this header file must be the first file in the JFS include
+ * list of every JFS .c file.
+ */
+#include <linux/types.h>
+#include <linux/nls.h>
+#include "endian24.h"
+/*
+ * transaction and lock id's
+ *
+ * Both are unsigned 16-bit values; jfs_txnmgr.h uses them as indices
+ * into the TxBlock and TxLock tables.
+ *
+ * Don't change these without carefully considering the impact on the
+ * size and alignment of all of the linelock variants
+ */
+typedef u16 tid_t;
+typedef u16 lid_t;
+/*
+ * Almost identical to Linux's timespec, but not quite:
+ * both fields are fixed-width 32-bit little-endian values.
+ */
+struct timestruc_t {
+        __le32 tv_sec;
+        __le32 tv_nsec;
+};
+/*
+ *      handy
+ *
+ * Bit-pattern constants and a C boolean convention.
+ */
+#define LEFTMOSTONE     0x80000000      /* same bit as HIGHORDER, no u suffix */
+#define HIGHORDER       0x80000000u     /* high order bit on            */
+#define ONES            0xffffffffu     /* all bits on                  */
+typedef int boolean_t;  /* holds TRUE or FALSE */
+#define TRUE 1
+#define FALSE 0
+/*
+ *      logical xd (lxd)
+ *
+ * A 24-bit length plus a 40-bit offset that is split into an 8-bit
+ * high part (off1) and a 32-bit low part (off2).
+ */
+typedef struct {
+        unsigned len:24;        /* length */
+        unsigned off1:8;        /* offset bits 32-39 */
+        u32 off2;               /* offset bits 0-31 */
+} lxd_t;
+/* lxd_t field construction
+ *
+ * LXDoffset() expands to a bare brace block, not an expression.
+ * NOTE(review): offset64 is not parenthesized under the (s64) cast and
+ * length32 is not parenthesized either, so pass simple expressions only.
+ */
+#define LXDlength(lxd, length32)        ( (lxd)->len = length32 )
+#define LXDoffset(lxd, offset64)\
+{\
+        (lxd)->off1 = ((s64)offset64) >> 32;\
+        (lxd)->off2 = (offset64) & 0xffffffff;\
+}
+/* lxd_t field extraction: offsetLXD() recombines off1/off2 into an s64 */
+#define lengthLXD(lxd)  ( (lxd)->len )
+#define offsetLXD(lxd)\
+        ( ((s64)((lxd)->off1)) << 32 | (lxd)->off2 )
+/* lxd list */
+struct lxdlist {
+        s16 maxnlxd;            /* presumably capacity of lxd[] -- confirm */
+        s16 nlxd;               /* presumably entries in use -- confirm */
+        lxd_t *lxd;             /* the list itself, stored elsewhere */
+};
+/*
+ *      physical xd (pxd)
+ *
+ * A 24-bit length plus a 40-bit address that is split into an 8-bit
+ * high part (addr1) and a 32-bit low part (addr2).  len and addr2 are
+ * kept in little-endian form; always go through the macros below.
+ */
+typedef struct {
+        unsigned len:24;        /* length, stored via __cpu_to_le24() */
+        unsigned addr1:8;       /* address bits 32-39 */
+        __le32 addr2;           /* address bits 0-31, little-endian */
+} pxd_t;
+/* xd_t field construction
+ *
+ * PXDaddress() expands to a bare brace block, not an expression.
+ * NOTE(review): address64 is not parenthesized under the (s64) cast,
+ * so pass simple expressions only.
+ */
+#define PXDlength(pxd, length32)        ((pxd)->len = __cpu_to_le24(length32))
+#define PXDaddress(pxd, address64)\
+{\
+        (pxd)->addr1 = ((s64)address64) >> 32;\
+        (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+}
+/* xd_t field extraction: values are returned in CPU byte order */
+#define lengthPXD(pxd)  __le24_to_cpu((pxd)->len)
+#define addressPXD(pxd)\
+        ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2))
+#define MAXTREEHEIGHT 8         /* dimension of pxdlist.pxd[] */
+/* pxd list: a fixed array of MAXTREEHEIGHT pxd entries plus two counters */
+struct pxdlist {
+        s16 maxnpxd;            /* presumably entries reserved -- confirm */
+        s16 npxd;               /* presumably entries in use -- confirm */
+        pxd_t pxd[MAXTREEHEIGHT];
+};
+/*
+ *      data extent descriptor (dxd)
+ *
+ * The number that starts each field comment is the field size in bytes.
+ */
+typedef struct {
+        unsigned flag:8;        /* 1: flags */
+        unsigned rsrvd:24;      /* 3: reserved */
+        __le32 size;            /* 4: size in byte */
+        unsigned len:24;        /* 3: length in unit of fsblksize */
+        unsigned addr1:8;       /* 1: address in unit of fsblksize */
+        __le32 addr2;           /* 4: address in unit of fsblksize */
+} dxd_t;                        /* - 16 - */
+/* dxd_t flags */
+#define DXD_INDEX       0x80    /* B+-tree index */
+#define DXD_INLINE      0x40    /* in-line data extent */
+#define DXD_EXTENT      0x20    /* out-of-line single extent */
+#define DXD_FILE        0x10    /* out-of-line file (inode) */
+#define DXD_CORRUPT     0x08    /* Inconsistency detected */
+/* dxd_t field construction
+ *      Conveniently, the PXD macros work for DXD
+ *      (dxd_t has len, addr1 and addr2 members with the same names)
+ */
+#define DXDlength       PXDlength
+#define DXDaddress      PXDaddress
+#define lengthDXD       lengthPXD
+#define addressDXD      addressPXD
+#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
+#define sizeDXD(dxd)    le32_to_cpu((dxd)->size)
+/*
+ *      directory entry argument
+ *
+ * A name in wchar_t form together with its length.
+ */
+struct component_name {
+        int namlen;             /* number of wchar_t in name, without
+                                 * the terminating 0
+                                 */
+        wchar_t *name;          /* the characters; get_UCSname() stores a
+                                 * kmalloc'ed buffer here
+                                 */
+};
+/*
+ *      DASD limit information - stored in directory inode
+ *
+ * limit and used are 40-bit quantities, each split into an 8-bit high
+ * part (*_hi) and a little-endian 32-bit low part (*_lo).
+ */
+struct dasd {
+        u8 thresh;              /* Alert Threshold (in percent) */
+        u8 delta;               /* Alert Threshold delta (in percent)   */
+        u8 rsrvd1;
+        u8 limit_hi;            /* DASD limit (in logical blocks)       */
+        __le32 limit_lo;        /* DASD limit (in logical blocks)       */
+        u8 rsrvd2[3];
+        u8 used_hi;             /* DASD usage (in logical blocks)       */
+        __le32 used_lo;         /* DASD usage (in logical blocks)       */
+};
+#define DASDLIMIT(dasdp) \
+        (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo))
+#define setDASDLIMIT(dasdp, limit)\
+{\
+        (dasdp)->limit_hi = ((u64)limit) >> 32;\
+        (dasdp)->limit_lo = __cpu_to_le32(limit);\
+}
+#define DASDUSED(dasdp) \
+        (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo))
+#define setDASDUSED(dasdp, used)\
+{\
+        (dasdp)->used_hi = ((u64)used) >> 32;\
+        (dasdp)->used_lo = __cpu_to_le32(used);\
+}
+#endif                          /* !_H_JFS_TYPES */
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
new file mode 100644
index 000000000000..f31a9e3f3fec
--- /dev/null
+++ b/fs/jfs/jfs_umount.c
@@ -0,0 +1,178 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *      jfs_umount.c
+ *
+ * note: file system in transition to aggregate/fileset:
+ * (ref. jfs_mount.c)
+ *
+ * file system unmount is interpreted as unmount of the single/only
+ * fileset in the aggregate and, if unmount of the last fileset,
+ * as unmount of the aggregate;
+ */
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+/*
+ * NAME:        jfs_umount(sb)
+ *
+ * FUNCTION:    vfs_umount()
+ *
+ *              Flushes the journal when a log is attached, closes the
+ *              fileset inode map, the secondary and primary aggregate
+ *              inode maps and the block allocation map, writes out the
+ *              block device mapping and, when a log is attached, marks
+ *              the superblock clean and closes the log.
+ *
+ * PARAMETERS:  sb      - super block of the file system being unmounted
+ *
+ * RETURN :     0 when no log is attached (read-only mount), otherwise
+ *              the return code of lmLogClose()
+ */
+int jfs_umount(struct super_block *sb)
+{
+        struct address_space *bdev_mapping = sb->s_bdev->bd_inode->i_mapping;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct inode *ipbmap = sbi->ipbmap;
+        struct inode *ipimap = sbi->ipimap;
+        struct inode *ipaimap = sbi->ipaimap;
+        struct inode *ipaimap2 = sbi->ipaimap2;
+        struct jfs_log *log;
+        int rc = 0;
+        jfs_info("UnMount JFS: sb:0x%p", sb);
+        /*
+         *      update superblock and close log
+         *
+         * if mounted read-write and log based recovery was enabled
+         */
+        if ((log = sbi->log))
+                /*
+                 * Wait for outstanding transactions to be written to log:
+                 */
+                jfs_flush_journal(log, 2);
+        /*
+         * close fileset inode allocation map (aka fileset inode)
+         */
+        diUnmount(ipimap, 0);
+        diFreeSpecial(ipimap);
+        sbi->ipimap = NULL;
+        /*
+         * close secondary aggregate inode allocation map
+         */
+        ipaimap2 = sbi->ipaimap2;
+        if (ipaimap2) {
+                diUnmount(ipaimap2, 0);
+                diFreeSpecial(ipaimap2);
+                sbi->ipaimap2 = NULL;
+        }
+        /*
+         * close aggregate inode allocation map
+         */
+        ipaimap = sbi->ipaimap;
+        diUnmount(ipaimap, 0);
+        diFreeSpecial(ipaimap);
+        sbi->ipaimap = NULL;
+        /*
+         * close aggregate block allocation map
+         */
+        dbUnmount(ipbmap, 0);
+        diFreeSpecial(ipbmap);
+        /*
+         * Clear the block map pointer (ipimap was already cleared above),
+         * so that no stale reference to the freed inode is left in sbi.
+         */
+        sbi->ipbmap = NULL;
+        /*
+         * Make sure all metadata makes it to disk before we mark
+         * the superblock as clean.  The return values of the two
+         * writeback calls are not propagated to the caller.
+         */
+        filemap_fdatawrite(bdev_mapping);
+        filemap_fdatawait(bdev_mapping);
+        /*
+         * ensure all file system file pages are propagated to their
+         * home blocks on disk (and their in-memory buffer pages are
+         * invalidated) BEFORE updating file system superblock state
+         * (to signify file system is unmounted cleanly, and thus in
+         * consistent state) and log superblock active file system
+         * list (to signify skip logredo()).
+         */
+        if (log) {              /* log = NULL if read-only mount */
+                updateSuper(sb, FM_CLEAN);
+                /* Restore default gfp_mask for bdev */
+                mapping_set_gfp_mask(bdev_mapping, GFP_USER);
+                /*
+                 * close log:
+                 *
+                 * remove file system from log active file system list.
+                 */
+                rc = lmLogClose(sb);
+        }
+        jfs_info("UnMount JFS Complete: rc = %d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_umount_rw(sb)
+ *
+ * FUNCTION:    For a file system with an attached log: flush the
+ *              journal, sync the block and inode allocation maps, write
+ *              out the block device mapping, mark the superblock clean
+ *              and close the log.  The allocation map inodes stay open.
+ *
+ * PARAMETERS:  sb      - super block of the file system
+ *
+ * RETURN :     0 when no log is attached, otherwise the return code of
+ *              lmLogClose()
+ */
+int jfs_umount_rw(struct super_block *sb)
+{
+        struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_log *journal = sbi->log;
+        /* nothing to flush or close without a log */
+        if (journal == NULL)
+                return 0;
+        /* push outstanding transactions to the log first */
+        jfs_flush_journal(journal, 2);
+        /* then get the allocation maps out to disk */
+        dbSync(sbi->ipbmap);
+        diSync(sbi->ipimap);
+        /*
+         * sync_blockdev() may repeat this write-out shortly afterwards,
+         * but it cannot be skipped here: the superblock must not be
+         * marked clean before everything has been flushed to disk.
+         */
+        filemap_fdatawrite(mapping);
+        filemap_fdatawait(mapping);
+        updateSuper(sb, FM_CLEAN);
+        /* Restore default gfp_mask for bdev */
+        mapping_set_gfp_mask(mapping, GFP_USER);
+        /* closing the log drops this file system from its active list */
+        return lmLogClose(sb);
+}
diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c
new file mode 100644
index 000000000000..b32208aad550
--- /dev/null
+++ b/fs/jfs/jfs_unicode.c
@@ -0,0 +1,137 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+/*
+ * NAME:        jfs_strfromUCS_le()
+ *
+ * FUNCTION:    Convert little-endian unicode string to character string
+ *
+ * PARAMETERS:  to       - output buffer; the result is NUL terminated and
+ *                         no bound is checked here
+ *              from     - little-endian 16-bit input characters
+ *              len      - maximum number of input characters to convert;
+ *                         conversion also stops at the first 0 character
+ *              codepage - NLS table used for the conversion, or NULL to
+ *                         copy the low byte of each character
+ *
+ * RETURN :     number of bytes stored in to, not counting the final NUL
+ */
+int jfs_strfromUCS_le(char *to, const __le16 * from,
+                      int len, struct nls_table *codepage)
+{
+        static int warn_again = 5;      /* Only warn up to 5 times total */
+        int warn = !!warn_again;        /* once per string */
+        int outlen = 0;
+        int i;
+        if (!codepage) {
+                /* no table: one output byte per input character */
+                for (i = 0; (i < len) && from[i]; i++) {
+                        int ch = le16_to_cpu(from[i]);
+                        if (!(ch & 0xff00)) {
+                                to[i] = (char) ch;
+                                continue;
+                        }
+                        /* character does not fit into a single byte */
+                        if (warn) {
+                                warn = 0;
+                                warn_again--;
+                                printk(KERN_ERR
+                        "non-latin1 character 0x%x found in JFS file name\n",
+                                       ch);
+                                printk(KERN_ERR
+                                "mount with iocharset=utf8 to access\n");
+                        }
+                        to[i] = '?';
+                }
+                outlen = i;
+        } else {
+                /* let the table translate; it may emit several bytes */
+                for (i = 0; (i < len) && from[i]; i++) {
+                        int charlen =
+                            codepage->uni2char(le16_to_cpu(from[i]),
+                                               &to[outlen],
+                                               NLS_MAX_CHARSET_SIZE);
+                        if (charlen <= 0)
+                                to[outlen++] = '?';
+                        else
+                                outlen += charlen;
+                }
+        }
+        to[outlen] = 0;
+        return outlen;
+}
+/*
+ * NAME:        jfs_strtoUCS()
+ *
+ * FUNCTION:    Convert character string to unicode string
+ *
+ * PARAMETERS:  to       - output buffer; receives the converted
+ *                         characters followed by a terminating 0
+ *              from     - input bytes
+ *              len      - maximum number of input bytes to consume;
+ *                         conversion also stops at the first 0 byte
+ *              codepage - NLS table used for the conversion, or NULL to
+ *                         widen each byte unchanged
+ *
+ * RETURN :     number of characters stored in to (without the final 0),
+ *              or the value returned by char2uni when that is below 1;
+ *              in that case to is left unterminated
+ */
+static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len,
+                struct nls_table *codepage)
+{
+        int i = 0;
+        int charlen;
+        if (!codepage) {
+                while ((i < len) && from[i]) {
+                        to[i] = (wchar_t) from[i];
+                        i++;
+                }
+        } else {
+                while (len && *from) {
+                        charlen = codepage->char2uni(from, len, &to[i]);
+                        if (charlen < 1) {
+                                jfs_err("jfs_strtoUCS: char2uni returned %d.",
+                                        charlen);
+                                jfs_err("charset = %s, char = 0x%x",
+                                        codepage->charset, *from);
+                                return charlen;
+                        }
+                        /* one output character used up charlen bytes */
+                        i++;
+                        from += charlen;
+                        len -= charlen;
+                }
+        }
+        to[i] = 0;
+        return i;
+}
+/*
+ * NAME:        get_UCSname()
+ *
+ * FUNCTION:    Allocate and translate to unicode string
+ *
+ * PARAMETERS:  uniName - receives the allocated buffer (name) and the
+ *                        converted length (namlen)
+ *              dentry  - supplies the name to convert and, through its
+ *                        super block, the NLS table to use
+ *
+ * RETURN :     0             - success; the caller releases the buffer
+ *                              with free_UCSname()
+ *              -ENAMETOOLONG - name is longer than JFS_NAME_MAX
+ *              -ENOMEM       - the buffer could not be allocated
+ *              other < 0     - error reported by jfs_strtoUCS(); the
+ *                              buffer has been freed and name is NULL
+ */
+int get_UCSname(struct component_name * uniName, struct dentry *dentry)
+{
+        struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab;
+        int length = dentry->d_name.len;
+        if (length > JFS_NAME_MAX)
+                return -ENAMETOOLONG;
+        /* one extra element for the terminating 0 */
+        uniName->name =
+            kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS);
+        if (uniName->name == NULL)
+                return -ENOMEM; /* out of memory, not out of disk space */
+        uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name,
+                                       length, nls_tab);
+        if (uniName->namlen < 0) {
+                kfree(uniName->name);
+                /* do not leave a pointer to freed memory behind */
+                uniName->name = NULL;
+                return uniName->namlen;
+        }
+        return 0;
+}
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
new file mode 100644
index 000000000000..69e25ebe87ac
--- /dev/null
+++ b/fs/jfs/jfs_unicode.h
@@ -0,0 +1,155 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_UNICODE
+#define _H_JFS_UNICODE
+#include <asm/byteorder.h>
+#include "jfs_types.h"
+/*
+ * One contiguous range of code points together with its upper-case table.
+ * UniToupper() adds table[uc - start] to uc when start <= uc <= end.
+ */
+typedef struct {
+        wchar_t start;          /* first code point of the range */
+        wchar_t end;            /* last code point of the range (inclusive) */
+        signed char *table;     /* signed deltas, indexed by (uc - start) */
+} UNICASERANGE;
+/* Upper-case deltas for the code points below sizeof(UniUpperTable) */
+extern signed char UniUpperTable[512];
+/* Range tables scanned by UniToupper(); an entry with start == 0 ends it */
+extern UNICASERANGE UniUpperRange[];
+/* Allocates (COMP)->name and fills it from the dentry name; 0 on success */
+extern int get_UCSname(struct component_name *, struct dentry *);
+extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *);
+/* Releases the buffer that get_UCSname() stored in (COMP)->name */
+#define free_UCSname(COMP) kfree((COMP)->name)
+/*
+ * UniStrcpy:  Copy a null-terminated unicode string, terminator included.
+ *             Returns the destination pointer.
+ */
+static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2)
+{
+        wchar_t *dst = ucs1;
+        for (;;) {
+                *dst = *ucs2++;
+                if (!*dst)      /* terminator has just been copied */
+                        break;
+                dst++;
+        }
+        return ucs1;
+}
+/*
+ * UniStrncpy_le:  Copy at most n characters of a little-endian string and
+ *                 fill the rest of the n-character destination with nulls.
+ *                 No terminator is written when the source has n or more
+ *                 characters.  Returns the destination pointer.
+ */
+static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2,
+                                  size_t n)
+{
+        size_t i = 0;
+        /* Copy until the source terminator or the length limit */
+        while ((i < n) && ucs2[i]) {
+                ucs1[i] = ucs2[i];
+                i++;
+        }
+        /* Null-fill whatever is left of the destination */
+        while (i < n)
+                ucs1[i++] = 0;
+        return ucs1;
+}
+/*
+ * UniStrncmp_le:  Compare up to n characters of a cpu-endian string (ucs1)
+ *                 with a little-endian string (ucs2).  Returns zero when
+ *                 they match, otherwise the difference of the first pair
+ *                 of characters that differ.
+ */
+static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2,
+                                size_t n)
+{
+        if (n == 0)
+                return 0;       /* nothing to compare: equal */
+        for (;;) {
+                if (*ucs1 != __le16_to_cpu(*ucs2))
+                        break;  /* mismatch */
+                if (*ucs1 == 0)
+                        break;  /* both strings ended */
+                if (--n == 0)
+                        break;  /* length limit reached */
+                ucs1++;
+                ucs2++;
+        }
+        return (int) *ucs1 - (int) __le16_to_cpu(*ucs2);
+}
+/*
+ * UniStrncpy_to_le:  Copy at most n characters of a cpu-endian string into
+ *                    a little-endian buffer, converting each character,
+ *                    and fill the rest of the n-character destination
+ *                    with nulls.  Returns the destination pointer.
+ */
+static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2,
+                                       size_t n)
+{
+        size_t i;
+        /* Convert and copy until the terminator or the length limit */
+        for (i = 0; (i < n) && ucs2[i]; i++)
+                ucs1[i] = cpu_to_le16(ucs2[i]);
+        /* Null-fill whatever is left of the destination */
+        for (; i < n; i++)
+                ucs1[i] = 0;
+        return ucs1;
+}
+/*
+ * UniStrncpy_from_le:  Copy at most n characters of a little-endian string
+ *                      into a cpu-endian buffer, converting each character,
+ *                      and fill the rest of the n-character destination
+ *                      with nulls.  Returns the destination pointer.
+ */
+static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2,
+                                          size_t n)
+{
+        size_t i;
+        /* Convert and copy until the terminator or the length limit */
+        for (i = 0; (i < n) && ucs2[i]; i++)
+                ucs1[i] = __le16_to_cpu(ucs2[i]);
+        /* Null-fill whatever is left of the destination */
+        for (; i < n; i++)
+                ucs1[i] = 0;
+        return ucs1;
+}
+/*
+ * UniToupper:  Return the upper-case form of a unicode character.
+ *              Characters without a table entry are returned unchanged.
+ */
+static inline wchar_t UniToupper(wchar_t uc)
+{
+        UNICASERANGE *range;
+        /* Low code points are looked up directly in the base table */
+        if (uc < sizeof(UniUpperTable))
+                return uc + UniUpperTable[uc];
+        /* Otherwise walk the range tables until the zero-start entry */
+        for (range = UniUpperRange; range->start; range++) {
+                if (uc < range->start)
+                        break;  /* falls in a gap before this range */
+                if (uc <= range->end)
+                        return uc + range->table[uc - range->start];
+        }
+        return uc;              /* no mapping: upper case equals input */
+}
+/*
+ * UniStrupr:  Convert a null-terminated unicode string to upper case in
+ *             place.  Returns the pointer it was given.
+ */
+static inline wchar_t *UniStrupr(wchar_t * upin)
+{
+        wchar_t *cur;
+        for (cur = upin; *cur; cur++)
+                *cur = UniToupper(*cur);
+        return upin;
+}
+#endif                          /* !_H_JFS_UNICODE */
diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c
new file mode 100644
index 000000000000..4ab185d26308
--- /dev/null
+++ b/fs/jfs/jfs_uniupr.c
@@ -0,0 +1,134 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include "jfs_unicode.h"
+/*
+ * Latin upper case
+ *
+ * Indexed directly by code point (0x000-0x1ff).  Each entry is the signed
+ * value UniToupper() adds to the code point to get its upper-case form;
+ * 0 leaves the code point unchanged.
+ */
+signed char UniUpperTable[512] = {
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 000-00f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 010-01f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 020-02f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 030-03f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 040-04f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 050-05f */
+   0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,  0,  0,  0,  0,  0, /* 070-07f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 080-08f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 090-09f */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0a0-0af */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0b0-0bf */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0c0-0cf */
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0d0-0df */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */
+ -32,-32,-32,-32,-32,-32,-32,  0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 100-10f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 110-11f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 120-12f */
+   0,  0,  0, -1,  0, -1,  0, -1,  0,  0, -1,  0, -1,  0, -1,  0, /* 130-13f */
+  -1,  0, -1,  0, -1,  0, -1,  0, -1,  0,  0, -1,  0, -1,  0, -1, /* 140-14f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 150-15f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 160-16f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0,  0, -1,  0, -1,  0, -1,  0, /* 170-17f */
+   0,  0,  0, -1,  0, -1,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, /* 180-18f */
+   0,  0, -1,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0, /* 190-19f */
+   0, -1,  0, -1,  0, -1,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0, /* 1a0-1af */
+  -1,  0,  0,  0, -1,  0, -1,  0,  0, -1,  0,  0,  0, -1,  0,  0, /* 1b0-1bf */
+   0,  0,  0,  0,  0, -1, -2,  0, -1, -2,  0, -1, -2,  0, -1,  0, /* 1c0-1cf */
+  -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,-79,  0, -1, /* 1d0-1df */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e0-1ef */
+   0,  0, -1, -2,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0, -1, /* 1f0-1ff */
+};
+/*
+ * Upper case range - Greek
+ * Entry i holds the delta for code point 0x03a0 + i (0x03a0-0x03ce).
+ */
+static signed char UniCaseRangeU03a0[47] = {
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,-38,-37,-37,-37, /* 3a0-3af */
+   0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */
+ -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63,
+};
+/*
+ * Upper case range - Cyrillic
+ * Entry i holds the delta for code point 0x0430 + i (0x0430-0x045f).
+ */
+static signed char UniCaseRangeU0430[48] = {
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */
+   0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,  0,-80,-80, /* 450-45f */
+};
+/*
+ * Upper case range - Extended cyrillic
+ * Entry i holds the delta for code point 0x0490 + i (0x0490-0x04cc).
+ */
+static signed char UniCaseRangeU0490[61] = {
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 490-49f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 4a0-4af */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 4b0-4bf */
+   0,  0, -1,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,
+};
+/*
+ * Upper case range - Extended latin and greek
+ * Entry i holds the delta for code point 0x1e00 + i (0x1e00-0x1ffc).
+ */
+static signed char UniCaseRangeU1e00[509] = {
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e00-1e0f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e10-1e1f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e20-1e2f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e30-1e3f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e40-1e4f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e50-1e5f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e60-1e6f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e70-1e7f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1e80-1e8f */
+   0, -1,  0, -1,  0, -1,  0,  0,  0,  0,  0,-59,  0, -1,  0, -1, /* 1e90-1e9f */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ea0-1eaf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1eb0-1ebf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ec0-1ecf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ed0-1edf */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1, /* 1ee0-1eef */
+   0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0, /* 1ef0-1eff */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f00-1f0f */
+   8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f10-1f1f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f20-1f2f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f30-1f3f */
+   8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f40-1f4f */
+   0,  8,  0,  8,  0,  8,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f50-1f5f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f60-1f6f */
+  74, 74, 86, 86, 86, 86,100,100,  0,  0,112,112,126,126,  0,  0, /* 1f70-1f7f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f80-1f8f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1f90-1f9f */
+   8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fa0-1faf */
+   8,  8,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fb0-1fbf */
+   0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fc0-1fcf */
+   8,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fd0-1fdf */
+   8,  8,  0,  0,  0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 1fe0-1fef */
+   0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+};
+/*
+ * Upper case range - Wide latin
+ * Entry i holds the delta for code point 0xff40 + i (0xff40-0xff5a).
+ */
+static signed char UniCaseRangeUff40[27] = {
+   0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */
+ -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,
+};
+/*
+ * Upper Case Range
+ *
+ * Each entry is { first code point, last code point, delta table }.
+ * UniToupper() scans the entries in order and gives up as soon as the
+ * character is below an entry's start, so entries must stay sorted by
+ * ascending start.  The all-zero entry terminates the list.
+ */
+UNICASERANGE UniUpperRange[] = {
+    { 0x03a0,  0x03ce,  UniCaseRangeU03a0 },
+    { 0x0430,  0x045f,  UniCaseRangeU0430 },
+    { 0x0490,  0x04cc,  UniCaseRangeU0490 },
+    { 0x1e00,  0x1ffc,  UniCaseRangeU1e00 },
+    { 0xff40,  0xff5a,  UniCaseRangeUff40 },
+    { 0 }
+};
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
new file mode 100644
index 000000000000..a1052f3f0bee
--- /dev/null
+++ b/fs/jfs/jfs_xattr.h
@@ -0,0 +1,64 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef H_JFS_XATTR
+#define H_JFS_XATTR
+/*
+ * jfs_ea_list describes the on-disk format of the extended attributes.
+ * I know the null-terminator is redundant since namelen is stored, but
+ * I am maintaining compatibility with OS/2 where possible.
+ *
+ * A list is a jfs_ea_list header followed by variable-length jfs_ea
+ * records laid end to end; each record is its fixed header, the name,
+ * the name's null-terminator and then the value (see EA_SIZE below).
+ */
+struct jfs_ea {
+        u8 flag;        /* Unused? */
+        u8 namelen;     /* Length of name */
+        __le16 valuelen;        /* Length of value */
+        char name[0];   /* Attribute name (includes null-terminator) */
+};                      /* Value immediately follows name */
+struct jfs_ea_list {
+        __le32 size;            /* overall size */
+        struct jfs_ea ea[0];    /* Variable length list */
+};
+/* Macros for defining maximum number of bytes supported for EAs */
+#define MAXEASIZE       65535
+#define MAXEALISTSIZE   MAXEASIZE
+/*
+ * some macros for dealing with variable length EA lists.
+ *
+ * EA_SIZE(ea)         bytes occupied by one record: fixed header, name,
+ *                     one byte for the name's null-terminator, and value
+ * NEXT_EA(ea)         record that starts EA_SIZE(ea) bytes after ea
+ * FIRST_EA(ealist)    first record of the list
+ * EALIST_SIZE(ealist) the list's size field converted to cpu byte order
+ * END_EALIST(ealist)  address EALIST_SIZE(ealist) bytes past the start of
+ *                     the list header
+ */
+#define EA_SIZE(ea) \
+        (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \
+         le16_to_cpu((ea)->valuelen))
+#define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea))))
+#define FIRST_EA(ealist) ((ealist)->ea)
+#define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size)
+#define END_EALIST(ealist) \
+        ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
+/*
+ * Extended attribute entry points.  The __jfs_* variants take the inode
+ * directly; the others take a dentry.
+ */
+extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
+                          int);
+extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
+                        int);
+extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
+extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
+extern int jfs_removexattr(struct dentry *, const char *);
+#endif  /* H_JFS_XATTR */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
new file mode 100644
index 000000000000..11c58c54b818
--- /dev/null
+++ b/fs/jfs/jfs_xtree.c
@@ -0,0 +1,4485 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *      jfs_xtree.c: extent allocation descriptor B+-tree manager
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_dinode.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+/*
+ * xtree local flag
+ *
+ * XT_INSERT is the search process flag accepted by xtSearch() in its
+ * 'flag' argument.
+ */
+#define XT_INSERT       0x00000001
+/*
+ *       xtree key/entry comparison: extent offset
+ *
+ * CMP receives the result; OFFSET64 is a caller-supplied scratch variable
+ * that is overwritten with offsetXAD(X).  K is evaluated more than once,
+ * so it must not have side effects.
+ *
+ * return:
+ *      -1: k < start of extent
+ *       0: start_of_extent <= k <= end_of_extent
+ *       1: k > end_of_extent
+ *          (i.e. k >= extent offset + extent length)
+ */
+#define XT_CMP(CMP, K, X, OFFSET64)\
+{\
+        OFFSET64 = offsetXAD(X);\
+        (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
+              ((K) < OFFSET64) ? -1 : 0;\
+}
+/*
+ * write a xad entry
+ *
+ * Stores FLAG in (XAD)->flag, then sets the offset, length and address
+ * through XADoffset(), XADlength() and XADaddress().
+ */
+#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
+{\
+        (XAD)->flag = (FLAG);\
+        XADoffset((XAD), (OFF));\
+        XADlength((XAD), (LEN));\
+        XADaddress((XAD), (ADDR));\
+}
+/* BT_PAGE specialised for the xtree: page type xtpage_t, root i_xtroot */
+#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
+/*
+ * get page buffer for specified block address
+ *
+ * Wraps BT_GETPAGE and, when that succeeds (RC == 0), sanity-checks the
+ * page header.  The page is treated as corrupt when
+ *   - nextindex is below XTENTRYSTART, or
+ *   - nextindex is above maxentry, or
+ *   - maxentry is above the slot limit (XTROOTMAXSLOT when BN is 0,
+ *     PSIZE >> L2XTSLOTSIZE otherwise).
+ * On corruption it calls jfs_error(), releases the page with BT_PUTPAGE,
+ * sets MP to NULL and sets RC to -EIO.
+ */
+/* ToDo: Replace this ugly macro with a function */
+#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
+{\
+        BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
+        if (!(RC))\
+        {\
+                if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
+                    (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
+                    (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\
+                {\
+                        jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\
+                        BT_PUTPAGE(MP);\
+                        MP = NULL;\
+                        RC = -EIO;\
+                }\
+        }\
+}
+/* for consistency: xtree-named alias of BT_PUTPAGE, pairs with XT_GETPAGE */
+#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
+/* BT_GETSEARCH specialised for the xtree: page type xtpage_t, root i_xtroot */
+#define XT_GETSEARCH(IP, LEAF, BN, MP,  P, INDEX) \
+        BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
+/*
+ * xtree entry parameter descriptor
+ *
+ * Handed to xtSplitUp(), xtSplitPage() and xtSplitRoot().
+ * NOTE(review): the field meanings are not visible from these declarations
+ * alone; flag/off/addr/len presumably describe the xad entry involved in
+ * the split - confirm against the split routines.
+ */
+struct xtsplit {
+        struct metapage *mp;
+        s16 index;
+        u8 flag;
+        s64 off;
+        s64 addr;
+        int len;
+        struct pxdlist *pxdlist;
+};
+/*
+ *      statistics
+ *
+ * Only built when CONFIG_JFS_STATISTICS is defined.  xtSearch() bumps
+ * 'search' on entry through INCREMENT(xtStat.search).
+ */
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+        uint search;
+        uint fastSearch;
+        uint split;
+} xtStat;
+#endif
+/*
+ * forward references
+ *
+ * Prototypes for the file-local helpers used before their definitions.
+ * Those inside the _STILL_TO_PORT section are only declared when that
+ * symbol is defined.
+ */
+static int xtSearch(struct inode *ip,
+                    s64 xoff, int *cmpp, struct btstack * btstack, int flag);
+static int xtSplitUp(tid_t tid,
+                     struct inode *ip,
+                     struct xtsplit * split, struct btstack * btstack);
+static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
+                       struct metapage ** rmpp, s64 * rbnp);
+static int xtSplitRoot(tid_t tid, struct inode *ip,
+                       struct xtsplit * split, struct metapage ** rmpp);
+#ifdef _STILL_TO_PORT
+static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
+                      xtpage_t * fp, struct btstack * btstack);
+static int xtSearchNode(struct inode *ip,
+                        xad_t * xad,
+                        int *cmpp, struct btstack * btstack, int flag);
+static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
+#endif                          /*  _STILL_TO_PORT */
+/* External references */
+/*
+ *      debug control
+ */
+/*      #define _JFS_DEBUG_XTREE        1 */
+/*
+ *      xtLookup()
+ *
+ * function: map a single page into a physical extent;
+ *
+ * parameter:
+ *      ip       - file object;
+ *      lstart   - logical start of the request, in file system blocks;
+ *      llen     - length of the request;
+ *      pflag    - out: flag of the xad covering lstart;
+ *      paddr    - out: physical address that corresponds to lstart;
+ *      plen     - out: mapped length, 0 when no xad covers lstart;
+ *      no_check - nonzero skips the beyond-end-of-file test;
+ *
+ * return:
+ *      0, or the error returned by xtSearch();
+ *      *pflag and *paddr are written only when an xad covers lstart.
+ */
+int xtLookup(struct inode *ip, s64 lstart,
+             s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
+{
+        struct btstack btstack;
+        struct metapage *mp;
+        xtpage_t *p;
+        xad_t *xad;
+        s64 bn;
+        s64 nblocks, xoff, xend, xaddr;
+        int xlen;
+        int index;
+        int cmp;
+        int rc;
+        *plen = 0;
+        if (!no_check) {
+                /* file size rounded up to whole blocks */
+                nblocks = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+                    JFS_SBI(ip->i_sb)->l2bsize;
+                /* nothing to map at or beyond end of file */
+                if (lstart >= nblocks) {
+                        jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)",
+                                (ulong) lstart, (ulong) nblocks);
+                        return 0;
+                }
+        }
+        /* locate the xad entry covering lstart */
+        rc = xtSearch(ip, lstart, &cmp, &btstack, 0);
+        if (rc) {
+                jfs_err("xtLookup: xtSearch returned %d", rc);
+                return rc;
+        }
+        /*
+         * fetch the search result; on a miss (e.g., hole in sparse file)
+         * the index is that of the next entry and cmp is nonzero.
+         */
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        if (cmp == 0) {
+                /* lstart lies inside this xad: translate it */
+                xad = &p->xad[index];
+                xoff = offsetXAD(xad);
+                xlen = lengthXAD(xad);
+                xend = xoff + xlen;
+                xaddr = addressXAD(xad);
+                *pflag = xad->flag;
+                *paddr = xaddr + (lstart - xoff);
+                /* never report more than the xad actually backs */
+                *plen = min(xend - lstart, llen);
+        }
+        XT_PUTPAGE(mp);
+        return rc;
+}
+/*
+ *      xtLookupList()
+ *
+ * function: map a single logical extent into a list of physical extent;
+ *
+ * parameter:
+ *      struct inode    *ip,
+ *      struct lxdlist  *lxdlist,       lxd list (in)
+ *      struct xadlist  *xadlist,       xad list (in/out)
+ *      int             flag)           (not referenced by this function)
+ *
+ * coverage of lxd by xad under assumption of
+ * . lxd's are ordered and disjoint.
+ * . xad's are ordered and disjoint.
+ *
+ * return:
+ *      0:      success
+ *      otherwise the error from xtSearch() or XT_GETPAGE()
+ *
+ * xadlist->nxad is cleared on entry and only set to the number of entries
+ * produced when the walk ends at 'mapend'; the early-return paths leave
+ * it at 0.  At most xadlist->maxnxad entries are written.
+ *
+ * note: a page being written (even a single byte) is backed fully,
+ *      except the last page which is only backed with blocks
+ *      required to cover the last byte;
+ *      the extent backing a page is fully contained within an xad;
+ */
+int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
+                 struct xadlist * xadlist, int flag)
+{
+        int rc = 0;
+        struct btstack btstack;
+        int cmp;
+        s64 bn;
+        struct metapage *mp;
+        xtpage_t *p;
+        int index;
+        lxd_t *lxd;
+        xad_t *xad, *pxd;
+        s64 size, lstart, lend, xstart, xend, pstart;
+        s64 llen, xlen, plen;
+        s64 xaddr, paddr;
+        int nlxd, npxd, maxnpxd;
+        npxd = xadlist->nxad = 0;
+        maxnpxd = xadlist->maxnxad;
+        pxd = xadlist->xad;
+        nlxd = lxdlist->nlxd;
+        lxd = lxdlist->lxd;
+        lstart = offsetLXD(lxd);
+        llen = lengthLXD(lxd);
+        lend = lstart + llen;
+        /* file size rounded up to whole file system blocks */
+        size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+            JFS_SBI(ip->i_sb)->l2bsize;
+        /*
+         * search for the xad entry covering the logical extent
+         */
+      search:
+        /* no page is held whenever control reaches this label */
+        if (lstart >= size)
+                return 0;
+        if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0)))
+                return rc;
+        /*
+         *      compute the physical extent covering logical extent
+         *
+         * N.B. search may have failed (e.g., hole in sparse file),
+         * and returned the index of the next entry.
+         */
+//map:
+        /* retrieve search result */
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        /* is xad on the next sibling page ? */
+        if (index == le16_to_cpu(p->header.nextindex)) {
+                if (p->header.flag & BT_ROOT)
+                        goto mapend;
+                if ((bn = le64_to_cpu(p->header.next)) == 0)
+                        goto mapend;
+                XT_PUTPAGE(mp);
+                /* get next sibling page */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                index = XTENTRYSTART;
+        }
+        xad = &p->xad[index];
+        /*
+         * is lxd covered by xad ?
+         */
+      compare:
+        /* load the current xad; compare1 re-enters below with a new lxd */
+        xstart = offsetXAD(xad);
+        xlen = lengthXAD(xad);
+        xend = xstart + xlen;
+        xaddr = addressXAD(xad);
+      compare1:
+        if (xstart < lstart)
+                goto compare2;
+        /* (lstart <= xstart) */
+        /* lxd is NOT covered by xad */
+        if (lend <= xstart) {
+                /*
+                 * get next lxd
+                 */
+                if (--nlxd == 0)
+                        goto mapend;
+                lxd++;
+                lstart = offsetLXD(lxd);
+                llen = lengthLXD(lxd);
+                lend = lstart + llen;
+                if (lstart >= size)
+                        goto mapend;
+                /* compare with the current xad  */
+                goto compare1;
+        }
+        /* lxd is covered by xad */
+        else {                  /* (xstart < lend) */
+                /* initialize new pxd */
+                pstart = xstart;
+                plen = min(lend - xstart, xlen);
+                paddr = xaddr;
+                goto cover;
+        }
+        /* (xstart < lstart) */
+      compare2:
+        /* lxd is covered by xad */
+        if (lstart < xend) {
+                /* initialize new pxd */
+                pstart = lstart;
+                plen = min(xend - lstart, llen);
+                paddr = xaddr + (lstart - xstart);
+                goto cover;
+        }
+        /* lxd is NOT covered by xad */
+        else {                  /* (xend <= lstart) */
+                /*
+                 * get next xad
+                 *
+                 * linear search next xad covering lxd on
+                 * the current xad page, and then tree search
+                 */
+                if (index == le16_to_cpu(p->header.nextindex) - 1) {
+                        if (p->header.flag & BT_ROOT)
+                                goto mapend;
+                        /* release the page before restarting the search */
+                        XT_PUTPAGE(mp);
+                        goto search;
+                } else {
+                        index++;
+                        xad++;
+                        /* compare with new xad */
+                        goto compare;
+                }
+        }
+        /*
+         * lxd is covered by xad and a new pxd has been initialized
+         * (lstart <= xstart < lend) or (xstart < lstart < xend)
+         */
+      cover:
+        /* finalize pxd corresponding to current xad */
+        XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr);
+        /* stop once the output list is full */
+        if (++npxd >= maxnpxd)
+                goto mapend;
+        pxd++;
+        /*
+         * lxd is fully covered by xad
+         */
+        if (lend <= xend) {
+                /*
+                 * get next lxd
+                 */
+                if (--nlxd == 0)
+                        goto mapend;
+                lxd++;
+                lstart = offsetLXD(lxd);
+                llen = lengthLXD(lxd);
+                lend = lstart + llen;
+                if (lstart >= size)
+                        goto mapend;
+                /*
+                 * test for old xad covering new lxd
+                 * (old xstart < new lstart)
+                 */
+                goto compare2;
+        }
+        /*
+         * lxd is partially covered by xad
+         */
+        else {                  /* (xend < lend)  */
+                /*
+                 * get next xad
+                 *
+                 * linear search next xad covering lxd on
+                 * the current xad page, and then next xad page search
+                 */
+                if (index == le16_to_cpu(p->header.nextindex) - 1) {
+                        if (p->header.flag & BT_ROOT)
+                                goto mapend;
+                        if ((bn = le64_to_cpu(p->header.next)) == 0)
+                                goto mapend;
+                        XT_PUTPAGE(mp);
+                        /* get next sibling page */
+                        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc)
+                                return rc;
+                        index = XTENTRYSTART;
+                        xad = &p->xad[index];
+                } else {
+                        index++;
+                        xad++;
+                }
+                /*
+                 * test for new xad covering old lxd
+                 * (old lstart < new xstart)
+                 */
+                goto compare;
+        }
+      mapend:
+        /* publish the number of entries produced */
+        xadlist->nxad = npxd;
+//out:
+        XT_PUTPAGE(mp);
+        return rc;
+}
+/*
+ *      xtSearch()
+ *
+ * function:    search for the xad entry covering specified offset.
+ *
+ * parameters:
+ *      ip      - file object;
+ *      xoff    - extent offset;
+ *      cmpp    - comparison result:
+ *      btstack - traverse stack;
+ *      flag    - search process flag (XT_INSERT);
+ *
+ * returns:
+ *      btstack contains (bn, index) of search path traversed to the entry.
+ *      *cmpp is set to result of comparison with the entry returned.
+ *      the page containing the entry is pinned at exit.
+ */
+static int xtSearch(struct inode *ip, s64 xoff, /* offset of extent */
+                    int *cmpp, struct btstack * btstack, int flag)     /* flag: only XT_INSERT is tested here */
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(ip);     /* carries the btorder/btindex hints */
+        int rc = 0;
+        int cmp = 1;            /* init for empty page */
+        s64 bn;                 /* block number */
+        struct metapage *mp;    /* page buffer */
+        xtpage_t *p;            /* page */
+        xad_t *xad;             /* entry probed by the sequential fast path */
+        int base, index, lim, btindex;  /* binary-search window; btindex = saved hint */
+        struct btframe *btsp;   /* top frame of btstack, receives the result */
+        int nsplit = 0;         /* number of pages to split */
+        s64 t64;                /* offset of the entry under comparison */
+        INCREMENT(xtStat.search);
+        BT_CLR(btstack);
+        btstack->nsplit = 0;
+        /*
+         *      search down tree from root:
+         *
+         * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+         * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+         *
+         * if entry with search key K is not found
+         * internal page search find the entry with largest key Ki
+         * less than K which point to the child page to search;
+         * leaf page search find the entry with smallest key Kj
+         * greater than K so that the returned index is the position of
+         * the entry to be shifted right for insertion of new entry.
+         * for empty tree, search key is greater than any key of the tree.
+         *
+         * by convention, root bn = 0.
+         *
+         * each pass of the loop below handles one page: it either returns
+         * from a leaf (page left pinned, result stored in btstack->top) or
+         * pushes the page on btstack, unpins it and moves to a child.
+         */
+        for (bn = 0;;) {
+                /* get/pin the page to search;
+                 * a non-zero rc is handed back to the caller unchanged
+                 */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /* try sequential access heuristics with the previous
+                 * access entry in target leaf page:
+                 * once search narrowed down into the target leaf,
+                 * key must either match an entry in the leaf or
+                 * key entry does not exist in the tree;
+                 *
+                 * the fast path is taken only when the inode is flagged
+                 * BT_SEQUENTIAL, the page is a leaf, and the remembered
+                 * btindex is still below the page's nextindex.  index is
+                 * assigned inside the condition itself.
+                 */
+//fastSearch:
+                if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
+                    (p->header.flag & BT_LEAF) &&
+                    (index = jfs_ip->btindex) <
+                    le16_to_cpu(p->header.nextindex)) {
+                        xad = &p->xad[index];
+                        t64 = offsetXAD(xad);
+                        if (xoff < t64 + lengthXAD(xad)) {      /* xoff is before the end of this extent */
+                                if (xoff >= t64) {
+                                        *cmpp = 0;      /* hit: xoff falls inside the remembered extent */
+                                        goto out;
+                                }
+                                /* stop sequential access heuristics:
+                                 * xoff lies before the remembered entry
+                                 */
+                                goto binarySearch;
+                        } else {        /* (t64 + lengthXAD(xad)) <= xoff */
+                                /* try next sequential entry */
+                                index++;
+                                if (index <
+                                    le16_to_cpu(p->header.nextindex)) {
+                                        xad++;
+                                        t64 = offsetXAD(xad);
+                                        if (xoff < t64 + lengthXAD(xad)) {
+                                                if (xoff >= t64) {
+                                                        *cmpp = 0;      /* hit on the following entry */
+                                                        goto out;
+                                                }
+                                                /* miss: key falls between
+                                                 * previous and this entry
+                                                 */
+                                                *cmpp = 1;
+                                                goto out;
+                                        }
+                                        /* (xoff >= t64 + lengthXAD(xad));
+                                         * matching entry may be further out:
+                                         * stop heuristic search
+                                         */
+                                        /* stop sequential access heuristics */
+                                        goto binarySearch;
+                                }
+                                /* (index == p->header.nextindex);
+                                 * miss: key entry does not exist in
+                                 * the target leaf/tree
+                                 */
+                                *cmpp = 1;
+                                goto out;
+                        }
+                        /*
+                         * if hit, return index of the entry found, and
+                         * if miss, where new entry with search key is
+                         * to be inserted;
+                         *
+                         * every branch above ends in a goto, so the code
+                         * from the label on is entered by goto only.  it
+                         * serves the fast path alone: the binary-search
+                         * exits further down have their own copies and,
+                         * unlike this one, also rewrite jfs_ip->btorder.
+                         */
+                      out:
+                        /* compute number of pages to split:
+                         * nsplit was counted on the way down (see next:),
+                         * a leaf that is not full resets it to 0
+                         */
+                        if (flag & XT_INSERT) {
+                                if (p->header.nextindex ==      /* little-endian */
+                                    p->header.maxentry)
+                                        nsplit++;
+                                else
+                                        nsplit = 0;
+                                btstack->nsplit = nsplit;
+                        }
+                        /* save search result */
+                        btsp = btstack->top;
+                        btsp->bn = bn;
+                        btsp->index = index;
+                        btsp->mp = mp;
+                        /* update sequential access heuristics */
+                        jfs_ip->btindex = index;
+                        INCREMENT(xtStat.fastSearch);
+                        return 0;
+                }
+                /* well, ... full search now */
+              binarySearch:
+                lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;  /* number of entries on the page */
+                /*
+                 * binary search with search key K on the current page
+                 *
+                 * each step probes the middle of the lim entries starting
+                 * at base; lim is halved by the loop header.
+                 */
+                for (base = XTENTRYSTART; lim; lim >>= 1) {
+                        index = base + (lim >> 1);
+                        XT_CMP(cmp, xoff, &p->xad[index], t64);
+                        if (cmp == 0) {
+                                /*
+                                 *      search hit
+                                 */
+                                /* search hit - leaf page:
+                                 * return the entry found
+                                 */
+                                if (p->header.flag & BT_LEAF) {
+                                        *cmpp = cmp;
+                                        /* compute number of pages to split */
+                                        if (flag & XT_INSERT) {
+                                                if (p->header.nextindex ==
+                                                    p->header.maxentry)
+                                                        nsplit++;
+                                                else
+                                                        nsplit = 0;
+                                                btstack->nsplit = nsplit;
+                                        }
+                                        /* save search result */
+                                        btsp = btstack->top;
+                                        btsp->bn = bn;
+                                        btsp->index = index;
+                                        btsp->mp = mp;
+                                        /* init sequential access heuristics:
+                                         * the access counts as sequential when
+                                         * it lands on the previously used index
+                                         * or on the one right after it
+                                         */
+                                        btindex = jfs_ip->btindex;
+                                        if (index == btindex ||
+                                            index == btindex + 1)
+                                                jfs_ip->btorder = BT_SEQUENTIAL;
+                                        else
+                                                jfs_ip->btorder = BT_RANDOM;
+                                        jfs_ip->btindex = index;
+                                        return 0;
+                                }
+                                /* search hit - internal page:
+                                 * descend/search its child page
+                                 */
+                                goto next;
+                        }
+                        if (cmp > 0) {  /* continue to the right of the probed entry */
+                                base = index + 1;
+                                --lim;
+                        }
+                }
+                /*
+                 *      search miss
+                 *
+                 * base is the smallest index with key (Kj) greater than
+                 * search key (K) and may be zero or maxentry index.
+                 */
+                /*
+                 * search miss - leaf page:
+                 *
+                 * return location of entry (base) where new entry with
+                 * search key K is to be inserted.
+                 * *cmpp receives the result of the last comparison, or the
+                 * initial value 1 when the page held no entries.
+                 */
+                if (p->header.flag & BT_LEAF) {
+                        *cmpp = cmp;
+                        /* compute number of pages to split */
+                        if (flag & XT_INSERT) {
+                                if (p->header.nextindex ==
+                                    p->header.maxentry)
+                                        nsplit++;
+                                else
+                                        nsplit = 0;
+                                btstack->nsplit = nsplit;
+                        }
+                        /* save search result */
+                        btsp = btstack->top;
+                        btsp->bn = bn;
+                        btsp->index = base;
+                        btsp->mp = mp;
+                        /* init sequential access heuristics */
+                        btindex = jfs_ip->btindex;
+                        if (base == btindex || base == btindex + 1)
+                                jfs_ip->btorder = BT_SEQUENTIAL;
+                        else
+                                jfs_ip->btorder = BT_RANDOM;
+                        jfs_ip->btindex = base;
+                        return 0;
+                }
+                /*
+                 * search miss - non-leaf page:
+                 *
+                 * if base is non-zero, decrement base by one to get the parent
+                 * entry of the child page to search.
+                 */
+                index = base ? base - 1 : base;
+                /*
+                 * go down to child page
+                 */
+              next:
+                /* update number of pages to split:
+                 * counts the full pages met in a row on the way down;
+                 * done for every search, whatever flag says
+                 */
+                if (p->header.nextindex == p->header.maxentry)
+                        nsplit++;
+                else
+                        nsplit = 0;
+                /* push (bn, index) of the parent page/entry */
+                BT_PUSH(btstack, bn, index);
+                /* get the child page block number */
+                bn = addressXAD(&p->xad[index]);
+                /* unpin the parent page;
+                 * the child is pinned at the top of the next pass
+                 */
+                XT_PUTPAGE(mp);
+        }
+}
+/*
+ *      xtInsert()
+ *
+ * function:
+ *      add one extent descriptor (xad) for <xoff, xlen> to the xtree of <ip>.
+ *      The target leaf and slot come from xtSearch(); when the leaf still
+ *      has a free slot the entry is stored in place, otherwise the request
+ *      is handed to xtSplitUp().
+ *
+ * parameter:
+ *      tid     - transaction id;
+ *      ip      - file object;
+ *      xflag   - extent flag (XAD_NOTRECORDED); XAD_NEW is always or'ed in;
+ *      xoff    - extent offset;
+ *      xlen    - extent length;
+ *      xaddrp  - extent address pointer (in/out):
+ *              if (*xaddrp)
+ *                      caller allocated data extent at *xaddrp;
+ *              else
+ *                      allocate data extent and return its xaddr;
+ *      flag    - not referenced by this function;
+ *
+ * return:
+ *      0       - entry inserted, *xaddrp holds the extent address;
+ *      -EEXIST - xtSearch() reported an exact hit (cmp == 0) for xoff;
+ *      other   - error passed back from xtSearch(), DQUOT_ALLOC_BLOCK(),
+ *                dbAlloc() or xtSplitUp().
+ */
+int xtInsert(tid_t tid,         /* transaction id */
+             struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
+             int flag)
+{
+        struct btstack stack;           /* path recorded by xtSearch() */
+        struct xtsplit split;           /* request handed to xtSplitUp() */
+        struct metapage *leaf_mp;       /* pinned leaf buffer */
+        xtpage_t *leaf;                 /* leaf page contents */
+        struct tlock *tl;
+        struct xtlock *xl;
+        xad_t *slot;                    /* entry being read or written */
+        s64 leaf_bn;                    /* filled in by XT_GETSEARCH only */
+        s64 extent, near;               /* extent address, allocation hint */
+        int pos, used;                  /* insert index, entries in use */
+        int hit;                        /* comparison result from xtSearch() */
+        int rc;
+        jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
+        /*
+         * locate the leaf and the index at which to insert;
+         * n.b. the index may be maxentry of a full page.
+         */
+        rc = xtSearch(ip, xoff, &hit, &stack, XT_INSERT);
+        if (rc)
+                return rc;
+        /* fetch the result first: the unpin label below needs leaf_mp */
+        XT_GETSEARCH(ip, stack.top, leaf_bn, leaf_mp, leaf, pos);
+        if (hit == 0) {
+                rc = -EEXIST;
+                goto unpin;
+        }
+        /*
+         * no extent supplied by the caller: charge quota and allocate one,
+         * hinting at the last block of the preceding entry when there is one
+         */
+        extent = *xaddrp;
+        if (extent == 0) {
+                near = 0;
+                if (pos > XTENTRYSTART) {
+                        slot = &leaf->xad[pos - 1];
+                        near = addressXAD(slot) + lengthXAD(slot) - 1;
+                }
+                rc = DQUOT_ALLOC_BLOCK(ip, xlen);
+                if (rc)
+                        goto unpin;
+                rc = dbAlloc(ip, near, (s64) xlen, &extent);
+                if (rc) {
+                        /* hand the quota charge back */
+                        DQUOT_FREE_BLOCK(ip, xlen);
+                        goto unpin;
+                }
+        }
+        /* the entry is always recorded as NEW */
+        xflag |= XAD_NEW;
+        used = le16_to_cpu(leaf->header.nextindex);
+        if (used == le16_to_cpu(leaf->header.maxentry)) {
+                /*
+                 * full leaf: xtSplitUp() splits the page, propagates the
+                 * router entry upwards, inserts the entry and unpins the
+                 * leaf, so this path returns without going through unpin.
+                 */
+                split.mp = leaf_mp;
+                split.index = pos;
+                split.flag = xflag;
+                split.off = xoff;
+                split.len = xlen;
+                split.addr = extent;
+                split.pxdlist = NULL;
+                rc = xtSplitUp(tid, ip, &split, &stack);
+                if (rc == 0) {
+                        *xaddrp = extent;
+                        return 0;
+                }
+                /* split failed: release an extent allocated above */
+                if (*xaddrp == 0) {
+                        dbFree(ip, extent, (s64) xlen);
+                        DQUOT_FREE_BLOCK(ip, xlen);
+                }
+                return rc;
+        }
+        /*
+         * room left in the leaf: store the entry in place
+         */
+        BT_MARK_DIRTY(leaf_mp, ip);
+        slot = &leaf->xad[pos];
+        /* inserting in the middle: open a gap by shifting the tail right */
+        if (pos < used)
+                memmove(slot + 1, slot, (used - pos) * sizeof(xad_t));
+        XT_PUTENTRY(slot, xflag, xoff, xlen, extent);
+        /* one more entry in use */
+        leaf->header.nextindex = cpu_to_le16(used + 1);
+        /* Don't log it if there are no links to the file */
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                tl = txLock(tid, ip, leaf_mp, tlckXTREE | tlckGROW);
+                xl = (struct xtlock *) &tl->lock;
+                /* keep lwm.offset at the lowest index seen (0 means unset) */
+                if (xl->lwm.offset == 0 || pos < (int) xl->lwm.offset)
+                        xl->lwm.offset = pos;
+                xl->lwm.length =
+                    le16_to_cpu(leaf->header.nextindex) - xl->lwm.offset;
+        }
+        *xaddrp = extent;
+      unpin:
+        /* unpin the leaf page */
+        XT_PUTPAGE(leaf_mp);
+        return rc;
+}
+/*
+ *      xtSplitUp()
+ *
+ * function:
+ *      split full pages as propagating insertion up the tree
+ *
+ *      the work is done in this order:
+ *      1. if the page is the root of a non-directory inode, its maxentry
+ *         is below XTROOTMAXSLOT and INLINEEA is set in mode2, the root is
+ *         grown in place to XTROOTMAXSLOT, INLINEEA is cleared and the
+ *         entry is inserted without any split;
+ *      2. otherwise, when split->pxdlist is NULL, btstack->nsplit index
+ *         blocks are allocated up front, and the page is split with
+ *         xtSplitRoot() or xtSplitPage();
+ *      3. a router entry for the new right page is then inserted into the
+ *         parent popped from btstack; a full parent is split in turn and
+ *         the walk continues, a parent with room ends it.
+ *
+ * parameter:
+ *      tid     - transaction id;
+ *      ip      - file object;
+ *      split   - entry parameter descriptor;
+ *                split->mp is the pinned page to insert into; the fields
+ *                are overwritten here when a parent page has to be split;
+ *      btstack - traverse stack from xtSearch()
+ *
+ * return:
+ *      0 on success; otherwise an error code: the result of dbAlloc() or
+ *      XT_GETPAGE(), -EIO when the first split fails, or the result of a
+ *      failed parent split.
+ */
+static int
+xtSplitUp(tid_t tid,
+          struct inode *ip, struct xtsplit * split, struct btstack * btstack)
+{
+        int rc = 0;
+        struct metapage *smp;   /* buffer of the page being split, later of each parent */
+        xtpage_t *sp;           /* split page */
+        struct metapage *rmp;   /* buffer of the new right page */
+        s64 rbn;                /* new right page block number */
+        struct metapage *rcmp;  /* buffer of the right child page */
+        xtpage_t *rcp;          /* right child page */
+        s64 rcbn;               /* right child page block number */
+        int skip;               /* index of entry of insertion */
+        int nextindex;          /* next available entry index of p */
+        struct btframe *parent; /* parent page entry on traverse stack */
+        xad_t *xad;             /* slot the new entry is written to */
+        s64 xaddr;              /* address of a freshly allocated index block */
+        int xlen;               /* length of each index block (nbperpage) */
+        int nsplit;             /* number of pages split */
+        struct pxdlist pxdlist; /* index blocks reserved for the splits */
+        pxd_t *pxd;
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        smp = split->mp;
+        sp = XT_PAGE(ip, smp);
+        /* is inode xtree root extension/inline EA area free ?
+         * if so the root is enlarged to XTROOTMAXSLOT entries and the
+         * INLINEEA bit is cleared, which makes room without a split.
+         */
+        if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
+            (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) &&
+            (JFS_IP(ip)->mode2 & INLINEEA)) {
+                sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
+                JFS_IP(ip)->mode2 &= ~INLINEEA;
+                BT_MARK_DIRTY(smp, ip);
+                /*
+                 * acquire a transaction lock on the leaf page;
+                 *
+                 * action: xad insertion/extension;
+                 */
+                /* if insert into middle, shift right remaining entries. */
+                skip = split->index;
+                nextindex = le16_to_cpu(sp->header.nextindex);
+                if (skip < nextindex)
+                        memmove(&sp->xad[skip + 1], &sp->xad[skip],
+                                (nextindex - skip) * sizeof(xad_t));
+                /* insert the new entry: mark the entry NEW */
+                xad = &sp->xad[skip];
+                XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                            split->addr);
+                /* advance next available entry index */
+                sp->header.nextindex =
+                    cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
+                /* Don't log it if there are no links to the file */
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
+                        xtlck = (struct xtlock *) & tlck->lock;
+                        xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                            min(skip, (int)xtlck->lwm.offset) : skip;
+                        xtlck->lwm.length =
+                            le16_to_cpu(sp->header.nextindex) -
+                            xtlck->lwm.offset;
+                }
+                return 0;       /* NOTE(review): smp is not passed to
+                                 * XT_PUTPAGE() on this path; presumably
+                                 * nothing needs releasing for a root page
+                                 * -- confirm */
+        }
+        /*
+         * allocate new index blocks to cover index page split(s)
+         *
+         * allocation hint: ?
+         *
+         * one block of nbperpage is taken per expected split; maxnpxd
+         * counts the blocks obtained, npxd starts at 0.
+         */
+        if (split->pxdlist == NULL) {
+                nsplit = btstack->nsplit;
+                split->pxdlist = &pxdlist;
+                pxdlist.maxnpxd = pxdlist.npxd = 0;
+                pxd = &pxdlist.pxd[0];
+                xlen = JFS_SBI(ip->i_sb)->nbperpage;
+                for (; nsplit > 0; nsplit--, pxd++) {
+                        if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
+                            == 0) {
+                                PXDaddress(pxd, xaddr);
+                                PXDlength(pxd, xlen);
+                                pxdlist.maxnpxd++;
+                                continue;
+                        }
+                        /* undo allocation
+                         * NOTE(review): only the split page is unpinned
+                         * here; index blocks obtained from dbAlloc() in
+                         * earlier iterations are not passed to dbFree()
+                         * on this path -- confirm whether they leak
+                         */
+                        XT_PUTPAGE(smp);
+                        return rc;
+                }
+        }
+        /*
+         * Split leaf page <sp> into <sp> and a new right page <rp>.
+         *
+         * The split routines insert the new entry into the leaf page,
+         * and acquire txLock as appropriate.
+         * return <rp> pinned and its block number <rpbn>.
+         * (only xtSplitPage() is handed &rbn)
+         */
+        rc = (sp->header.flag & BT_ROOT) ?
+            xtSplitRoot(tid, ip, split, &rmp) :
+            xtSplitPage(tid, ip, split, &rmp, &rbn);
+        XT_PUTPAGE(smp);
+        if (rc)
+                return -EIO;    /* NOTE(review): the rc reported by the split
+                                 * routine is replaced by -EIO here, while the
+                                 * parent-split path below returns rc as is
+                                 * -- confirm this is intended */
+        /*
+         * propagate up the router entry for the leaf page just split
+         *
+         * insert a router entry for the new page into the parent page,
+         * propagate the insert/split up the tree by walking back the stack
+         * of (bn of parent page, index of child page entry in parent page)
+         * that were traversed during the search for the page that split.
+         *
+         * the propagation of insert/split up the tree stops if the root
+         * splits or the page inserted into doesn't have to split to hold
+         * the new entry.
+         *
+         * the parent entry for the split page remains the same, and
+         * a new entry is inserted at its right with the first key and
+         * block number of the new right page.
+         *
+         * There are a maximum of 3 pages pinned at any time:
+         * right child, left parent and right parent (when the parent splits)
+         * to keep the child page pinned while working on the parent.
+         * make sure that all pins are released at exit.
+         */
+        while ((parent = BT_POP(btstack)) != NULL) {
+                /* parent page specified by stack frame <parent> */
+                /* keep current child pages <rcp> pinned
+                 * NOTE(review): rbn is written by xtSplitPage() only;
+                 * after a root split the stack is presumably empty so
+                 * this copy is never reached -- confirm
+                 */
+                rcmp = rmp;
+                rcbn = rbn;
+                rcp = XT_PAGE(ip, rcmp);
+                /*
+                 * insert router entry in parent for new right child page <rp>
+                 */
+                /* get/pin the parent page <sp> */
+                XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+                if (rc) {
+                        XT_PUTPAGE(rcmp);
+                        return rc;
+                }
+                /*
+                 * The new key entry goes ONE AFTER the index of parent entry,
+                 * because the split was to the right.
+                 */
+                skip = parent->index + 1;
+                /*
+                 * split or shift right remaining entries of the parent page
+                 */
+                nextindex = le16_to_cpu(sp->header.nextindex);
+                /*
+                 * parent page is full - split the parent page
+                 */
+                if (nextindex == le16_to_cpu(sp->header.maxentry)) {
+                        /* init for parent page split:
+                         * the router entry takes its offset from the first
+                         * entry of the right child and its address from
+                         * the child's block number
+                         */
+                        split->mp = smp;
+                        split->index = skip;    /* index at insert */
+                        split->flag = XAD_NEW;
+                        split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
+                        split->len = JFS_SBI(ip->i_sb)->nbperpage;
+                        split->addr = rcbn;
+                        /* unpin previous right child page */
+                        XT_PUTPAGE(rcmp);
+                        /* The split routines insert the new entry,
+                         * and acquire txLock as appropriate.
+                         * return <rp> pinned and its block number <rpbn>.
+                         */
+                        rc = (sp->header.flag & BT_ROOT) ?
+                            xtSplitRoot(tid, ip, split, &rmp) :
+                            xtSplitPage(tid, ip, split, &rmp, &rbn);
+                        if (rc) {
+                                XT_PUTPAGE(smp);
+                                return rc;
+                        }
+                        XT_PUTPAGE(smp);
+                        /* keep new child page <rp> pinned */
+                }
+                /*
+                 * parent page is not full - insert in parent page
+                 */
+                else {
+                        /*
+                         * insert router entry in parent for the right child
+                         * page from the first entry of the right child page:
+                         */
+                        /*
+                         * acquire a transaction lock on the parent page;
+                         *
+                         * action: router xad insertion;
+                         */
+                        BT_MARK_DIRTY(smp, ip);
+                        /*
+                         * if insert into middle, shift right remaining entries
+                         * (the byte count is written as a shift by
+                         * L2XTSLOTSIZE here, as sizeof(xad_t) in the root
+                         * extension path above)
+                         */
+                        if (skip < nextindex)
+                                memmove(&sp->xad[skip + 1], &sp->xad[skip],
+                                        (nextindex -
+                                         skip) << L2XTSLOTSIZE);
+                        /* insert the router entry */
+                        xad = &sp->xad[skip];
+                        XT_PUTENTRY(xad, XAD_NEW,
+                                    offsetXAD(&rcp->xad[XTENTRYSTART]),
+                                    JFS_SBI(ip->i_sb)->nbperpage, rcbn);
+                        /* advance next available entry index. */
+                        sp->header.nextindex =
+                            cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
+                                        1);
+                        /* Don't log it if there are no links to the file */
+                        if (!test_cflag(COMMIT_Nolink, ip)) {
+                                tlck = txLock(tid, ip, smp,
+                                              tlckXTREE | tlckGROW);
+                                xtlck = (struct xtlock *) & tlck->lock;
+                                xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                                    min(skip, (int)xtlck->lwm.offset) : skip;
+                                xtlck->lwm.length =
+                                    le16_to_cpu(sp->header.nextindex) -
+                                    xtlck->lwm.offset;
+                        }
+                        /* unpin parent page */
+                        XT_PUTPAGE(smp);
+                        /* exit propagate up */
+                        break;
+                }
+        }
+        /* unpin current right page:
+         * the right page left pinned by the last split performed
+         */
+        XT_PUTPAGE(rmp);
+        return 0;
+}
+/*
+ *      xtSplitPage()
+ *
+ * function:
+ *      split a full non-root page into
+ *      original/split/left page and new right page
+ *      i.e., the original/split page remains as left page.
+ *
+ * parameter:
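+ *      tid_t           tid,
+ *      struct inode    *ip,
+ *      struct xtsplit  *split,
+ *      struct metapage **rmpp,
+ *      s64             *rbnp,
+ *
+ * return:
+ *      0 on success, with the new right page returned pinned through *rmpp
+ *      and its block number through *rbnp; an error code (e.g. -EDQUOT,
+ *      -EIO) on failure.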
+ */
+static int
+xtSplitPage(tid_t tid, struct inode *ip,
+            struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
+{
+        int rc = 0;
+        struct metapage *smp;
+        xtpage_t *sp;
+        struct metapage *rmp;
+        xtpage_t *rp;           /* new right page allocated */
+        s64 rbn;                /* new right page block number */
+        struct metapage *mp;
+        xtpage_t *p;
+        s64 nextbn;
+        int skip, maxentry, middle, righthalf, n;
+        xad_t *xad;
+        struct pxdlist *pxdlist;
+        pxd_t *pxd;
+        struct tlock *tlck;
+        struct xtlock *sxtlck = NULL, *rxtlck = NULL;
+        int quota_allocation = 0;
+        smp = split->mp;
+        sp = XT_PAGE(ip, smp);
+        INCREMENT(xtStat.split);
+        pxdlist = split->pxdlist;
+        pxd = &pxdlist->pxd[pxdlist->npxd];
+        pxdlist->npxd++;
+        rbn = addressPXD(pxd);
+        /* Allocate blocks to quota. */
+       if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+               rc = -EDQUOT;
+               goto clean_up;
+        }
+        quota_allocation += lengthPXD(pxd);
+        /*
+         * allocate the new right page for the split
+         */
+        rmp = get_metapage(ip, rbn, PSIZE, 1);
+        if (rmp == NULL) {
+                rc = -EIO;
+                goto clean_up;
+        }
+        jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
+        BT_MARK_DIRTY(rmp, ip);
+        /*
+         * action: new page;
+         */
+        rp = (xtpage_t *) rmp->data;
+        rp->header.self = *pxd;
+        rp->header.flag = sp->header.flag & BT_TYPE;
+        rp->header.maxentry = sp->header.maxentry;      /* little-endian */
+        rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+        BT_MARK_DIRTY(smp, ip);
+        /* Don't log it if there are no links to the file */
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                /*
+                 * acquire a transaction lock on the new right page;
+                 */
+                tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
+                rxtlck = (struct xtlock *) & tlck->lock;
+                rxtlck->lwm.offset = XTENTRYSTART;
+                /*
+                 * acquire a transaction lock on the split page
+                 */
+                tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
+                sxtlck = (struct xtlock *) & tlck->lock;
+        }
+        /*
+         * initialize/update sibling pointers of <sp> and <rp>
+         */
+        nextbn = le64_to_cpu(sp->header.next);
+        rp->header.next = cpu_to_le64(nextbn);
+        rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
+        sp->header.next = cpu_to_le64(rbn);
+        skip = split->index;
+        /*
+         *      sequential append at tail (after last entry of last page)
+         *
+         * if splitting the last page on a level because of appending
+         * a entry to it (skip is maxentry), it's likely that the access is
+         * sequential. adding an empty page on the side of the level is less
+         * work and can push the fill factor much higher than normal.
+         * if we're wrong it's no big deal -  we will do the split the right
+         * way next time.
+         * (it may look like it's equally easy to do a similar hack for
+         * reverse sorted data, that is, split the tree left, but it's not.
+         * Be my guest.)
+         */
+        if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
+                /*
+                 * acquire a transaction lock on the new/right page;
+                 *
+                 * action: xad insertion;
+                 */
+                /* insert entry at the first entry of the new right page */
+                xad = &rp->xad[XTENTRYSTART];
+                XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                            split->addr);
+                rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        /* rxtlck->lwm.offset = XTENTRYSTART; */
+                        rxtlck->lwm.length = 1;
+                }
+                *rmpp = rmp;
+                *rbnp = rbn;
+                jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
+                return 0;
+        }
+        /*
+         *      non-sequential insert (at possibly middle page)
+         */
+        /*
+         * update previous pointer of old next/right page of <sp>
+         */
+        if (nextbn != 0) {
+                XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+                if (rc) {
+                        XT_PUTPAGE(rmp);
+                        goto clean_up;
+                }
+                BT_MARK_DIRTY(mp, ip);
+                /*
+                 * acquire a transaction lock on the next page;
+                 *
+                 * action:sibling pointer update;
+                 */
+                if (!test_cflag(COMMIT_Nolink, ip))
+                        tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+                p->header.prev = cpu_to_le64(rbn);
+                /* sibling page may have been updated previously, or
+                 * it may be updated later;
+                 */
+                XT_PUTPAGE(mp);
+        }
+        /*
+         * split the data between the split and new/right pages
+         */
+        maxentry = le16_to_cpu(sp->header.maxentry);
+        middle = maxentry >> 1;
+        righthalf = maxentry - middle;
+        /*
+         * skip index in old split/left page - insert into left page:
+         */
+        if (skip <= middle) {
+                /* move right half of split page to the new right page */
+                memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
+                        righthalf << L2XTSLOTSIZE);
+                /* shift right tail of left half to make room for new entry */
+                if (skip < middle)
+                        memmove(&sp->xad[skip + 1], &sp->xad[skip],
+                                (middle - skip) << L2XTSLOTSIZE);
+                /* insert new entry */
+                xad = &sp->xad[skip];
+                XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                            split->addr);
+                /* update page header */
+                sp->header.nextindex = cpu_to_le16(middle + 1);
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
+                            min(skip, (int)sxtlck->lwm.offset) : skip;
+                }
+                rp->header.nextindex =
+                    cpu_to_le16(XTENTRYSTART + righthalf);
+        }
+        /*
+         * skip index in new right page - insert into right page:
+         */
+        else {
+                /* move left head of right half to right page */
+                n = skip - middle;
+                memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
+                        n << L2XTSLOTSIZE);
+                /* insert new entry */
+                n += XTENTRYSTART;
+                xad = &rp->xad[n];
+                XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                            split->addr);
+                /* move right tail of right half to right page */
+                if (skip < maxentry)
+                        memmove(&rp->xad[n + 1], &sp->xad[skip],
+                                (maxentry - skip) << L2XTSLOTSIZE);
+                /* update page header */
+                sp->header.nextindex = cpu_to_le16(middle);
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
+                            min(middle, (int)sxtlck->lwm.offset) : middle;
+                }
+                rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
+                                                   righthalf + 1);
+        }
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
+                    sxtlck->lwm.offset;
+                /* rxtlck->lwm.offset = XTENTRYSTART; */
+                rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
+                    XTENTRYSTART;
+        }
+        *rmpp = rmp;
+        *rbnp = rbn;
+        jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
+        return rc;
+      clean_up:
+        /* Rollback quota allocation. */
+        if (quota_allocation)
+                DQUOT_FREE_BLOCK(ip, quota_allocation);
+        return (rc);
+}
+/*
+ *      xtSplitRoot()
+ *
+ * function:
+ *      split the full root page into
+ *      original/root/split page and new right page
+ *      i.e., root remains fixed in tree anchor (inode) and
+ *      the root is copied to a single new right child page
+ *      since root page << non-root page, and
+ *      the split root page contains a single entry for the
+ *      new right child page.
+ *
+ * parameter:
+ *      tid_t           tid     - transaction id handed to txLock();
+ *      struct inode    *ip     - inode holding the in-line xtree root;
+ *      struct xtsplit  *split  - split information: the entry to insert
+ *                                (index, flag, off, len, addr), the
+ *                                metapage of the root (mp) and the
+ *                                pxdlist supplying the extent of the
+ *                                new child page;
+ *      struct metapage **rmpp  - out: metapage of the new right child page;
+ *
+ * return:
+ *      0 on success, with *rmpp set to the new child's metapage;
+ *      -EIO if get_metapage() returns NULL for the new page;
+ *      -EDQUOT if DQUOT_ALLOC_BLOCK() fails for the new page
+ *      (the new metapage is released before returning).
+ */
+static int
+xtSplitRoot(tid_t tid,
+            struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
+{
+        xtpage_t *sp;           /* in-line root page being split */
+        struct metapage *rmp;   /* metapage of the new right child page */
+        xtpage_t *rp;           /* data of the new right child page */
+        s64 rbn;                /* block address of the new child page */
+        int skip, nextindex;
+        xad_t *xad;
+        pxd_t *pxd;
+        struct pxdlist *pxdlist;
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        sp = &JFS_IP(ip)->i_xtroot;
+        INCREMENT(xtStat.split);
+        /*
+         *      allocate a single (right) child page
+         *
+         * the extent of the page is the entry at index npxd of
+         * split->pxdlist; npxd is advanced past it.
+         */
+        pxdlist = split->pxdlist;
+        pxd = &pxdlist->pxd[pxdlist->npxd];
+        pxdlist->npxd++;
+        rbn = addressPXD(pxd);
+        rmp = get_metapage(ip, rbn, PSIZE, 1);
+        if (rmp == NULL)
+                return -EIO;
+        /*
+         * Allocate blocks to quota; on failure drop the new metapage
+         * and fail with -EDQUOT.
+         */
+        if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+                release_metapage(rmp);
+                return -EDQUOT;
+        }
+        jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
+        /*
+         * acquire a transaction lock on the new right page;
+         *
+         * action: new page;
+         *
+         * (the txLock() call itself is made further down, once the page
+         * has been filled in; its lwm then spans XTENTRYSTART through
+         * the last entry of the new page)
+         */
+        BT_MARK_DIRTY(rmp, ip);
+        rp = (xtpage_t *) rmp->data;
+        /* the child keeps the root's leaf/internal type */
+        rp->header.flag =
+            (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
+        rp->header.self = *pxd;
+        rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+        rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);
+        /* initialize sibling pointers */
+        rp->header.next = 0;
+        rp->header.prev = 0;
+        /*
+         * copy the in-line root page into new right page extent
+         *
+         * every slot from XTENTRYSTART up to the root's maxentry is
+         * copied, so nextindex below holds the root's maxentry.
+         */
+        nextindex = le16_to_cpu(sp->header.maxentry);
+        memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
+                (nextindex - XTENTRYSTART) << L2XTSLOTSIZE);
+        /*
+         * insert the new entry into the new right/child page
+         * (skip index in the new right page will not change)
+         */
+        skip = split->index;
+        /* if insert into middle, shift right remaining entries */
+        if (skip != nextindex)
+                memmove(&rp->xad[skip + 1], &rp->xad[skip],
+                        (nextindex - skip) * sizeof(xad_t));
+        xad = &rp->xad[skip];
+        XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);
+        /* update page header: copied entries plus the inserted one */
+        rp->header.nextindex = cpu_to_le16(nextindex + 1);
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
+                xtlck = (struct xtlock *) & tlck->lock;
+                xtlck->lwm.offset = XTENTRYSTART;
+                xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
+                    XTENTRYSTART;
+        }
+        /*
+         *      reset the root
+         *
+         * init root with the single entry for the new right page
+         * set the 1st entry offset to 0, which force the left-most key
+         * at any level of the tree to be less than any search key.
+         */
+        /*
+         * acquire a transaction lock on the root page (in-memory inode);
+         *
+         * action: root split;
+         *
+         * (the txLock() call follows the root update below and records
+         * a single entry starting at XTENTRYSTART)
+         */
+        BT_MARK_DIRTY(split->mp, ip);
+        xad = &sp->xad[XTENTRYSTART];
+        XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);
+        /* update page header of root: it is now an internal page */
+        sp->header.flag &= ~BT_LEAF;
+        sp->header.flag |= BT_INTERNAL;
+        sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
+                xtlck = (struct xtlock *) & tlck->lock;
+                xtlck->lwm.offset = XTENTRYSTART;
+                xtlck->lwm.length = 1;
+        }
+        *rmpp = rmp;
+        jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp);
+        return 0;
+}
+/*
+ *      xtExtend()
+ *
+ * function: extend in-place;
+ *
+ *      grow the extent that ends at xoff by xlen blocks; when the grown
+ *      length would exceed MAXXLEN, the old extent is set to MAXXLEN and
+ *      the excess is recorded as a separate XAD_NEW entry at index + 1
+ *      (through xtSplitUp() when the leaf page is full).
+ *
+ * parameter:
+ *      tid     - transaction id;
+ *      ip      - inode that owns the xtree;
+ *      xoff    - delta extent offset; must equal offset + length of the
+ *                extent found by searching for xoff - 1;
+ *      xlen    - delta extent length;
+ *      flag    - not referenced in the function body;
+ *
+ * return:
+ *      0 on success;
+ *      -EIO if the search for xoff - 1 does not hit an extent, or the
+ *      extent found does not end exactly at xoff;
+ *      otherwise the non-zero rc of xtSearch(), xtSplitUp() or
+ *      XT_GETPAGE().
+ *
+ * note: existing extent may or may not have been committed.
+ * caller is responsible for pager buffer cache update, and
+ * working block allocation map update;
+ * update pmap: alloc whole extended extent;
+ */
+int xtExtend(tid_t tid,         /* transaction id */
+             struct inode *ip, s64 xoff,        /* delta extent offset */
+             s32 xlen,          /* delta extent length */
+             int flag)          /* not referenced in this function */
+{
+        int rc = 0;
+        int cmp;
+        struct metapage *mp;    /* meta-page buffer */
+        xtpage_t *p;            /* base B+-tree index page */
+        s64 bn;
+        int index, nextindex, len;      /* len: blocks beyond MAXXLEN */
+        struct btstack btstack; /* traverse stack */
+        struct xtsplit split;   /* split information */
+        xad_t *xad;
+        s64 xaddr;              /* address of the overflow extent */
+        struct tlock *tlck;
+        struct xtlock *xtlck = NULL;    /* stays NULL under COMMIT_Nolink */
+        jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
+        /* there must exist extent to be extended */
+        if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT)))
+                return rc;
+        /* retrieve search result */
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        if (cmp != 0) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
+                return -EIO;
+        }
+        /* extension must be contiguous */
+        xad = &p->xad[index];
+        if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
+                return -EIO;
+        }
+        /*
+         * acquire a transaction lock on the leaf page;
+         *
+         * action: xad insertion/extension;
+         */
+        BT_MARK_DIRTY(mp, ip);
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+                xtlck = (struct xtlock *) & tlck->lock;
+        }
+        /*
+         * extend will overflow extent ?
+         *
+         * xlen becomes the total length after extension; len is the
+         * part of it that does not fit into MAXXLEN.
+         */
+        xlen = lengthXAD(xad) + xlen;
+        if ((len = xlen - MAXXLEN) <= 0)
+                goto extendOld;
+        /*
+         *      extent overflow: insert entry for new extent
+         *
+         * the new extent starts MAXXLEN blocks past the start of the
+         * old one, both in file offset and in disk address.
+         */
+//insertNew:
+        xoff = offsetXAD(xad) + MAXXLEN;
+        xaddr = addressXAD(xad) + MAXXLEN;
+        nextindex = le16_to_cpu(p->header.nextindex);
+        /*
+         *      if the leaf page is full, insert the new entry and
+         *      propagate up the router entry for the new page from split
+         *
+         * The xtSplitUp() will insert the entry and unpin the leaf page.
+         */
+        if (nextindex == le16_to_cpu(p->header.maxentry)) {
+                /* xtSplitUp() unpins leaf pages */
+                split.mp = mp;
+                split.index = index + 1;
+                split.flag = XAD_NEW;
+                split.off = xoff;       /* split offset */
+                split.len = len;
+                split.addr = xaddr;
+                split.pxdlist = NULL;
+                if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                        return rc;
+                /*
+                 * get back old page
+                 *
+                 * NOTE(review): the code below reuses `index` on page bn,
+                 * i.e. it assumes the extended entry was not moved to the
+                 * new right page by a non-root split - confirm.
+                 */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * if leaf root has been split, original root has been
+                 * copied to new child page, i.e., original entry now
+                 * resides on the new child page;
+                 */
+                if (p->header.flag & BT_INTERNAL) {
+                        ASSERT(p->header.nextindex ==
+                               cpu_to_le16(XTENTRYSTART + 1));
+                        xad = &p->xad[XTENTRYSTART];
+                        bn = addressXAD(xad);
+                        XT_PUTPAGE(mp);
+                        /* get new child page */
+                        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc)
+                                return rc;
+                        /* the child page needs its own dirty mark and tlock */
+                        BT_MARK_DIRTY(mp, ip);
+                        if (!test_cflag(COMMIT_Nolink, ip)) {
+                                tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                                xtlck = (struct xtlock *) & tlck->lock;
+                        }
+                }
+        }
+        /*
+         *      insert the new entry into the leaf page
+         *
+         * NOTE(review): the entry is written at index + 1 without
+         * shifting any following entries, so this presumably relies on
+         * the extended extent being the last entry of the page -
+         * confirm against callers.
+         */
+        else {
+                /* insert the new entry: mark the entry NEW */
+                xad = &p->xad[index + 1];
+                XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
+                /* advance next available entry index */
+                p->header.nextindex =
+                    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+        }
+        /* get back old entry; it is capped at the maximum extent length */
+        xad = &p->xad[index];
+        xlen = MAXXLEN;
+        /*
+         * extend old extent
+         *
+         * xlen is the full extended length when no overflow occurred,
+         * or MAXXLEN when the excess went into a new entry above.
+         */
+      extendOld:
+        XADlength(xad, xlen);
+        /* an entry not flagged XAD_NEW is flagged XAD_EXTENDED */
+        if (!(xad->flag & XAD_NEW))
+                xad->flag |= XAD_EXTENDED;
+        /* widen the tlock's lwm to run from index (or lower) to nextindex */
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                xtlck->lwm.offset =
+                    (xtlck->lwm.offset) ? min(index,
+                                              (int)xtlck->lwm.offset) : index;
+                xtlck->lwm.length =
+                    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+        }
+        /* unpin the leaf page */
+        XT_PUTPAGE(mp);
+        return rc;
+}
+#ifdef _NOTYET
+/*
+ *      xtTailgate()
+ *
+ * function: split existing 'tail' extent
+ *      (split offset >= start offset of tail extent), and
+ *      relocate and extend the split tail half;
+ *
+ *      the extent containing xoff must be the last entry of its leaf
+ *      page; the part of it from xoff onward is freed and replaced by
+ *      the new extent (xoff, xlen, xaddr).
+ *
+ * parameter:
+ *      tid     - transaction id;
+ *      ip      - inode that owns the xtree;
+ *      xoff    - split/new extent offset;
+ *      xlen    - new extent length;
+ *      xaddr   - new extent address;
+ *      flag    - not referenced in the function body;
+ *
+ * return:
+ *      0 on success;
+ *      -EIO if the search for xoff does not hit an extent, or the
+ *      extent found is not the last entry of its page;
+ *      otherwise the non-zero rc of xtSearch(), xtSplitUp() or
+ *      XT_GETPAGE().
+ *
+ * note: existing extent may or may not have been committed.
+ * caller is responsible for pager buffer cache update, and
+ * working block allocation map update;
+ * update pmap: free old split tail extent, alloc new extent;
+ */
+int xtTailgate(tid_t tid,               /* transaction id */
+               struct inode *ip, s64 xoff,      /* split/new extent offset */
+               s32 xlen,        /* new extent length */
+               s64 xaddr,       /* new extent address */
+               int flag)        /* not referenced in this function */
+{
+        int rc = 0;
+        int cmp;
+        struct metapage *mp;    /* meta-page buffer */
+        xtpage_t *p;            /* base B+-tree index page */
+        s64 bn;
+        int index, nextindex, llen, rlen;       /* llen kept, rlen freed */
+        struct btstack btstack; /* traverse stack */
+        struct xtsplit split;   /* split information */
+        xad_t *xad;
+        struct tlock *tlck;
+        struct xtlock *xtlck = 0;
+        struct tlock *mtlck;
+        struct maplock *pxdlock;
+/*
+printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
+        (ulong)xoff, xlen, (ulong)xaddr);
+*/
+        /* there must exist extent to be tailgated */
+        if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
+                return rc;
+        /* retrieve search result */
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        if (cmp != 0) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
+                return -EIO;
+        }
+        /* entry found must be last entry */
+        nextindex = le16_to_cpu(p->header.nextindex);
+        if (index != nextindex - 1) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb,
+                          "xtTailgate: the entry found is not the last entry");
+                return -EIO;
+        }
+        BT_MARK_DIRTY(mp, ip);
+        /*
+         * acquire tlock of the leaf page containing original entry
+         */
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+                xtlck = (struct xtlock *) & tlck->lock;
+        }
+        /*
+         * completely replace extent ?
+         *
+         * llen is the length of the old extent that lies before xoff;
+         * zero means the new extent starts where the old one does.
+         */
+        xad = &p->xad[index];
+/*
+printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
+        (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
+*/
+        if ((llen = xoff - offsetXAD(xad)) == 0)
+                goto updateOld;
+        /*
+         *      partially replace extent: insert entry for new extent
+         */
+//insertNew:
+        /*
+         *      if the leaf page is full, insert the new entry and
+         *      propagate up the router entry for the new page from split
+         *
+         * The xtSplitUp() will insert the entry and unpin the leaf page.
+         */
+        if (nextindex == le16_to_cpu(p->header.maxentry)) {
+                /* xtSplitUp() unpins leaf pages */
+                split.mp = mp;
+                split.index = index + 1;
+                split.flag = XAD_NEW;
+                split.off = xoff;       /* split offset */
+                split.len = xlen;
+                split.addr = xaddr;
+                split.pxdlist = NULL;
+                if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                        return rc;
+                /* get back old page */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * if leaf root has been split, original root has been
+                 * copied to new child page, i.e., original entry now
+                 * resides on the new child page;
+                 */
+                if (p->header.flag & BT_INTERNAL) {
+                        ASSERT(p->header.nextindex ==
+                               cpu_to_le16(XTENTRYSTART + 1));
+                        xad = &p->xad[XTENTRYSTART];
+                        bn = addressXAD(xad);
+                        XT_PUTPAGE(mp);
+                        /* get new child page */
+                        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc)
+                                return rc;
+                        /* the child page needs its own dirty mark and tlock */
+                        BT_MARK_DIRTY(mp, ip);
+                        if (!test_cflag(COMMIT_Nolink, ip)) {
+                                tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                                xtlck = (struct xtlock *) & tlck->lock;
+                        }
+                }
+        }
+        /*
+         *      insert the new entry into the leaf page
+         *
+         * the old entry was checked above to be the last one, so
+         * index + 1 is the first free slot and nothing is shifted.
+         */
+        else {
+                /* insert the new entry: mark the entry NEW */
+                xad = &p->xad[index + 1];
+                XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
+                /* advance next available entry index */
+                p->header.nextindex =
+                    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+        }
+        /* get back old XAD */
+        xad = &p->xad[index];
+        /*
+         * truncate/relocate old extent at split offset
+         */
+      updateOld:
+        /*
+         * update dmap for old/committed/truncated extent
+         *
+         * rlen is the length of the old extent from xoff onward, i.e.
+         * the blocks starting at addressXAD(xad) + llen that are freed.
+         */
+        rlen = lengthXAD(xad) - llen;
+        if (!(xad->flag & XAD_NEW)) {
+                /* free from PWMAP at commit: record a mlckFREEPXD maplock */
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        mtlck = txMaplock(tid, ip, tlckMAP);
+                        pxdlock = (struct maplock *) & mtlck->lock;
+                        pxdlock->flag = mlckFREEPXD;
+                        PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
+                        PXDlength(&pxdlock->pxd, rlen);
+                        pxdlock->index = 1;
+                }
+        } else
+                /* free from WMAP */
+                dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
+        if (llen)
+                /* truncate: keep only the part before xoff */
+                XADlength(xad, llen);
+        else
+                /* replace: the old slot now describes the new extent */
+                XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
+        /* widen the tlock's lwm to run from index (or lower) to nextindex */
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                    min(index, (int)xtlck->lwm.offset) : index;
+                xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+                    xtlck->lwm.offset;
+        }
+        /* unpin the leaf page */
+        XT_PUTPAGE(mp);
+        return rc;
+}
+#endif /* _NOTYET */
+/*
+ *      xtUpdate()
+ *
+ * function: update XAD;
+ *
+ *      update extent for allocated_but_not_recorded or
+ *      compressed extent;
+ *
+ * parameter:
+ *      nxad    - new XAD;
+ *                logical extent of the specified XAD must be completely
+ *                contained by an existing XAD;
+ */
+int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
+{                               /* new XAD */
+        int rc = 0;
+        int cmp;
+        struct metapage *mp;    /* meta-page buffer */
+        xtpage_t *p;            /* base B+-tree index page */
+        s64 bn;
+        int index0, index, newindex, nextindex;
+        struct btstack btstack; /* traverse stack */
+        struct xtsplit split;   /* split information */
+        xad_t *xad, *lxad, *rxad;
+        int xflag;
+        s64 nxoff, xoff;
+        int nxlen, xlen, lxlen, rxlen;
+        s64 nxaddr, xaddr;
+        struct tlock *tlck;
+        struct xtlock *xtlck = NULL;
+        int newpage = 0;
+        /* there must exist extent to be tailgated */
+        nxoff = offsetXAD(nxad);
+        nxlen = lengthXAD(nxad);
+        nxaddr = addressXAD(nxad);
+        if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT)))
+                return rc;
+        /* retrieve search result */
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
+        if (cmp != 0) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
+                return -EIO;
+        }
+        BT_MARK_DIRTY(mp, ip);
+        /*
+         * acquire tlock of the leaf page containing original entry
+         */
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+                xtlck = (struct xtlock *) & tlck->lock;
+        }
+        xad = &p->xad[index0];
+        xflag = xad->flag;
+        xoff = offsetXAD(xad);
+        xlen = lengthXAD(xad);
+        xaddr = addressXAD(xad);
+        /* nXAD must be completely contained within XAD */
+        if ((xoff > nxoff) ||
+            (nxoff + nxlen > xoff + xlen)) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb,
+                          "xtUpdate: nXAD in not completely contained within XAD");
+                return -EIO;
+        }
+        index = index0;
+        newindex = index + 1;
+        nextindex = le16_to_cpu(p->header.nextindex);
+#ifdef  _JFS_WIP_NOCOALESCE
+        if (xoff < nxoff)
+                goto updateRight;
+        /*
+         * replace XAD with nXAD
+         */
+      replace:                  /* (nxoff == xoff) */
+        if (nxlen == xlen) {
+                /* replace XAD with nXAD:recorded */
+                *xad = *nxad;
+                xad->flag = xflag & ~XAD_NOTRECORDED;
+                goto out;
+        } else                  /* (nxlen < xlen) */
+                goto updateLeft;
+#endif                          /* _JFS_WIP_NOCOALESCE */
+/* #ifdef _JFS_WIP_COALESCE */
+        if (xoff < nxoff)
+                goto coalesceRight;
+        /*
+         * coalesce with left XAD
+         */
+//coalesceLeft: /* (xoff == nxoff) */
+        /* is XAD first entry of page ? */
+        if (index == XTENTRYSTART)
+                goto replace;
+        /* is nXAD logically and physically contiguous with lXAD ? */
+        lxad = &p->xad[index - 1];
+        lxlen = lengthXAD(lxad);
+        if (!(lxad->flag & XAD_NOTRECORDED) &&
+            (nxoff == offsetXAD(lxad) + lxlen) &&
+            (nxaddr == addressXAD(lxad) + lxlen) &&
+            (lxlen + nxlen < MAXXLEN)) {
+                /* extend right lXAD */
+                index0 = index - 1;
+                XADlength(lxad, lxlen + nxlen);
+                /* If we just merged two extents together, need to make sure the
+                 * right extent gets logged.  If the left one is marked XAD_NEW,
+                 * then we know it will be logged.  Otherwise, mark as
+                 * XAD_EXTENDED
+                 */
+                if (!(lxad->flag & XAD_NEW))
+                        lxad->flag |= XAD_EXTENDED;
+                if (xlen > nxlen) {
+                        /* truncate XAD */
+                        XADoffset(xad, xoff + nxlen);
+                        XADlength(xad, xlen - nxlen);
+                        XADaddress(xad, xaddr + nxlen);
+                        goto out;
+                } else {        /* (xlen == nxlen) */
+                        /* remove XAD */
+                        if (index < nextindex - 1)
+                                memmove(&p->xad[index], &p->xad[index + 1],
+                                        (nextindex - index -
+                                         1) << L2XTSLOTSIZE);
+                        p->header.nextindex =
+                            cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+                                        1);
+                        index = index0;
+                        newindex = index + 1;
+                        nextindex = le16_to_cpu(p->header.nextindex);
+                        xoff = nxoff = offsetXAD(lxad);
+                        xlen = nxlen = lxlen + nxlen;
+                        xaddr = nxaddr = addressXAD(lxad);
+                        goto coalesceRight;
+                }
+        }
+        /*
+         * replace XAD with nXAD
+         */
+      replace:                  /* (nxoff == xoff) */
+        if (nxlen == xlen) {
+                /* replace XAD with nXAD:recorded */
+                *xad = *nxad;
+                xad->flag = xflag & ~XAD_NOTRECORDED;
+                goto coalesceRight;
+        } else                  /* (nxlen < xlen) */
+                goto updateLeft;
+        /*
+         * coalesce with right XAD
+         */
+      coalesceRight:            /* (xoff <= nxoff) */
+        /* is XAD last entry of page ? */
+        if (newindex == nextindex) {
+                if (xoff == nxoff)
+                        goto out;
+                goto updateRight;
+        }
+        /* is nXAD logically and physically contiguous with rXAD ? */
+        rxad = &p->xad[index + 1];
+        rxlen = lengthXAD(rxad);
+        if (!(rxad->flag & XAD_NOTRECORDED) &&
+            (nxoff + nxlen == offsetXAD(rxad)) &&
+            (nxaddr + nxlen == addressXAD(rxad)) &&
+            (rxlen + nxlen < MAXXLEN)) {
+                /* extend left rXAD */
+                XADoffset(rxad, nxoff);
+                XADlength(rxad, rxlen + nxlen);
+                XADaddress(rxad, nxaddr);
+                /* If we just merged two extents together, need to make sure
+                 * the left extent gets logged.  If the right one is marked
+                 * XAD_NEW, then we know it will be logged.  Otherwise, mark as
+                 * XAD_EXTENDED
+                 */
+                if (!(rxad->flag & XAD_NEW))
+                        rxad->flag |= XAD_EXTENDED;
+                if (xlen > nxlen)
+                        /* truncate XAD */
+                        XADlength(xad, xlen - nxlen);
+                else {          /* (xlen == nxlen) */
+                        /* remove XAD */
+                        memmove(&p->xad[index], &p->xad[index + 1],
+                                (nextindex - index - 1) << L2XTSLOTSIZE);
+                        p->header.nextindex =
+                            cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+                                        1);
+                }
+                goto out;
+        } else if (xoff == nxoff)
+                goto out;
+        if (xoff >= nxoff) {
+                XT_PUTPAGE(mp);
+                jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
+                return -EIO;
+        }
+/* #endif _JFS_WIP_COALESCE */
+        /*
+         * split XAD into (lXAD, nXAD):
+         *
+         *          |---nXAD--->
+         * --|----------XAD----------|--
+         *   |-lXAD-|
+         */
+      updateRight:              /* (xoff < nxoff) */
+        /* truncate old XAD as lXAD:not_recorded */
+        xad = &p->xad[index];
+        XADlength(xad, nxoff - xoff);
+        /* insert nXAD:recorded */
+        if (nextindex == le16_to_cpu(p->header.maxentry)) {
+                /* xtSpliUp() unpins leaf pages */
+                split.mp = mp;
+                split.index = newindex;
+                split.flag = xflag & ~XAD_NOTRECORDED;
+                split.off = nxoff;
+                split.len = nxlen;
+                split.addr = nxaddr;
+                split.pxdlist = NULL;
+                if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                        return rc;
+                /* get back old page */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * if leaf root has been split, original root has been
+                 * copied to new child page, i.e., original entry now
+                 * resides on the new child page;
+                 */
+                if (p->header.flag & BT_INTERNAL) {
+                        ASSERT(p->header.nextindex ==
+                               cpu_to_le16(XTENTRYSTART + 1));
+                        xad = &p->xad[XTENTRYSTART];
+                        bn = addressXAD(xad);
+                        XT_PUTPAGE(mp);
+                        /* get new child page */
+                        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc)
+                                return rc;
+                        BT_MARK_DIRTY(mp, ip);
+                        if (!test_cflag(COMMIT_Nolink, ip)) {
+                                tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                                xtlck = (struct xtlock *) & tlck->lock;
+                        }
+                } else {
+                        /* is nXAD on new page ? */
+                        if (newindex >
+                            (le16_to_cpu(p->header.maxentry) >> 1)) {
+                                newindex =
+                                    newindex -
+                                    le16_to_cpu(p->header.nextindex) +
+                                    XTENTRYSTART;
+                                newpage = 1;
+                        }
+                }
+        } else {
+                /* if insert into middle, shift right remaining entries */
+                if (newindex < nextindex)
+                        memmove(&p->xad[newindex + 1], &p->xad[newindex],
+                                (nextindex - newindex) << L2XTSLOTSIZE);
+                /* insert the entry */
+                xad = &p->xad[newindex];
+                *xad = *nxad;
+                xad->flag = xflag & ~XAD_NOTRECORDED;
+                /* advance next available entry index. */
+                p->header.nextindex =
+                    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+        }
+        /*
+         * does nXAD force 3-way split ?
+         *
+         *          |---nXAD--->|
+         * --|----------XAD-------------|--
+         *   |-lXAD-|           |-rXAD -|
+         */
+        if (nxoff + nxlen == xoff + xlen)
+                goto out;
+        /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
+        if (newpage) {
+                /* close out old page */
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                            min(index0, (int)xtlck->lwm.offset) : index0;
+                        xtlck->lwm.length =
+                            le16_to_cpu(p->header.nextindex) -
+                            xtlck->lwm.offset;
+                }
+                bn = le64_to_cpu(p->header.next);
+                XT_PUTPAGE(mp);
+                /* get new right page */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                BT_MARK_DIRTY(mp, ip);
+                if (!test_cflag(COMMIT_Nolink, ip)) {
+                        tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+                        xtlck = (struct xtlock *) & tlck->lock;
+                }
+                index0 = index = newindex;
+        } else
+                index++;
+        newindex = index + 1;
+        nextindex = le16_to_cpu(p->header.nextindex);
+        xlen = xlen - (nxoff - xoff);
+        xoff = nxoff;
+        xaddr = nxaddr;
+        /* recompute split pages */
+        if (nextindex == le16_to_cpu(p->header.maxentry)) {
+                XT_PUTPAGE(mp);
+                if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT)))
+                        return rc;
+                /* retrieve search result */
+                XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
+                if (cmp != 0) {
+                        XT_PUTPAGE(mp);
+                        jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
+                        return -EIO;
+                }
+                if (index0 != index) {
+                        XT_PUTPAGE(mp);
+                        jfs_error(ip->i_sb,
+                                  "xtUpdate: unexpected value of index");
+                        return -EIO;
+                }
+        }
+        /*
+         * split XAD into (nXAD, rXAD)
+         *
+         *          ---nXAD---|
+         * --|----------XAD----------|--
+         *                    |-rXAD-|
+         */
+      updateLeft:               /* (nxoff == xoff) && (nxlen < xlen) */
+        /* update old XAD with nXAD:recorded */
+        xad = &p->xad[index];
+        *xad = *nxad;
+        xad->flag = xflag & ~XAD_NOTRECORDED;
+        /* insert rXAD:not_recorded */
+        xoff = xoff + nxlen;
+        xlen = xlen - nxlen;
+        xaddr = xaddr + nxlen;
+        if (nextindex == le16_to_cpu(p->header.maxentry)) {
+/*
+printf("xtUpdate.updateLeft.split p:0x%p\n", p);
+*/
+                /* xtSpliUp() unpins leaf pages */
+                split.mp = mp;
+                split.index = newindex;
+                split.flag = xflag;
+                split.off = xoff;
+                split.len = xlen;
+                split.addr = xaddr;
+                split.pxdlist = NULL;
+                if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                        return rc;
+                /* get back old page */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * if leaf root has been split, original root has been
+                 * copied to new child page, i.e., original entry now
+                 * resides on the new child page;
+                 */
+                if (p->header.flag & BT_INTERNAL) {
+                        ASSERT(p->header.nextindex ==
+                               cpu_to_le16(XTENTRYSTART + 1));
+                        xad = &p->xad[XTENTRYSTART];
+                        bn = addressXAD(xad);
+                        XT_PUTPAGE(mp);
+                        /* get new child page */
+                        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc)
+                                return rc;
+                        BT_MARK_DIRTY(mp, ip);
+                        if (!test_cflag(COMMIT_Nolink, ip)) {
+                                tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                                xtlck = (struct xtlock *) & tlck->lock;
+                        }
+                }
+        } else {
+                /* if insert into middle, shift right remaining entries */
+                if (newindex < nextindex)
+                        memmove(&p->xad[newindex + 1], &p->xad[newindex],
+                                (nextindex - newindex) << L2XTSLOTSIZE);
+                /* insert the entry */
+                xad = &p->xad[newindex];
+                XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+                /* advance next available entry index. */
+                p->header.nextindex =
+                    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+        }
+      out:
+        if (!test_cflag(COMMIT_Nolink, ip)) {
+                xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                    min(index0, (int)xtlck->lwm.offset) : index0;
+                xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+                    xtlck->lwm.offset;
+        }
+        /* unpin the leaf page */
+        XT_PUTPAGE(mp);
+        return rc;
+}
+/*
+ *      xtAppend()
+ *
+ * function: grow in append mode from the contiguous region specified;
+ *      locates the insertion point for xoff with xtSearch(), allocates
+ *      the data extent with dbAllocBottomUp() and inserts a new XAD
+ *      (marked XAD_NEW) for it.  If the leaf page is full, index blocks
+ *      for the page split(s) are allocated first, starting at the front
+ *      of the region, and the insertion itself is done by xtSplitUp().
+ *
+ * parameter:
+ *      tid             - transaction id;
+ *      ip              - file object;
+ *      xflag           - extent flag; XAD_NEW is OR-ed in before insertion;
+ *      xoff            - extent offset (search key);
+ *      maxblocks       - max extent length; in the split path it is reduced
+ *                        by nbperpage for every index block allocated and
+ *                        then caps the data extent length;
+ *      xlenp           - extent length (in/out): requested length on entry,
+ *                        length of the extent inserted on success;
+ *      xaddrp          - extent address pointer (in/out): start of the
+ *                        region on entry, address of the data extent
+ *                        inserted on success;
+ *      flag            - not referenced in this function;
+ *
+ * return:
+ *      0 on success;
+ *      -EEXIST if xtSearch() reports an exact match for xoff;
+ *      otherwise the non-zero rc of xtSearch(), dbAllocBottomUp() or
+ *      xtSplitUp().
+ */
+int xtAppend(tid_t tid,         /* transaction id */
+             struct inode *ip, int xflag, s64 xoff, s32 maxblocks,      
+             s32 * xlenp,       /* (in/out) */
+             s64 * xaddrp,      /* (in/out) */
+             int flag)
+{
+        int rc = 0;
+        struct metapage *mp;    /* meta-page buffer */
+        xtpage_t *p;            /* base B+-tree index page */
+        s64 bn, xaddr;
+        int index, nextindex;
+        struct btstack btstack; /* traverse stack */
+        struct xtsplit split;   /* split information */
+        xad_t *xad;
+        int cmp;                /* comparison result from xtSearch() */
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        int nsplit, nblocks, xlen;
+        struct pxdlist pxdlist; /* index blocks handed to xtSplitUp() */
+        pxd_t *pxd;
+        xaddr = *xaddrp;
+        xlen = *xlenp;
+        jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx",
+                 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
+        /*
+         *      search for the entry location at which to insert:
+         *
+         * xtFastSearch() and xtSearch() both return (leaf page
+         * pinned, index at which to insert).
+         * n.b. xtSearch() may return index of maxentry of
+         * the full page.
+         */
+        if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
+                return rc;
+        /* retrieve search result */
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        if (cmp == 0) {
+                rc = -EEXIST;
+                goto out;
+        }
+//insert:
+        /*
+         *      insert entry for new extent
+         */
+        xflag |= XAD_NEW;
+        /*
+         *      if the leaf page is full, split the page and
+         *      propagate up the router entry for the new page from split
+         *
+         * The xtSplitUp() will insert the entry and unpin the leaf page.
+         */
+        nextindex = le16_to_cpu(p->header.nextindex);
+        if (nextindex < le16_to_cpu(p->header.maxentry))
+                goto insertLeaf;
+        /*
+         * allocate new index blocks to cover index page split(s)
+         *
+         * one block of nbperpage is taken per split, each directly
+         * following the previous one; xaddr ends up just past them.
+         */
+        nsplit = btstack.nsplit;
+        split.pxdlist = &pxdlist;
+        pxdlist.maxnpxd = pxdlist.npxd = 0;
+        pxd = &pxdlist.pxd[0];
+        nblocks = JFS_SBI(ip->i_sb)->nbperpage;
+        for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {   
+                if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
+                        PXDaddress(pxd, xaddr);
+                        PXDlength(pxd, nblocks);
+                        pxdlist.maxnpxd++;
+                        continue;
+                }
+                /* undo allocation
+                 * NOTE(review): nothing is freed on this path; index
+                 * blocks allocated by earlier iterations are not
+                 * released here - confirm.
+                 */
+                goto out;
+        }
+        xlen = min(xlen, maxblocks);    
+        /*
+         * allocate data extent requested
+         */
+        if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
+                goto out;
+        split.mp = mp;
+        split.index = index;
+        split.flag = xflag;
+        split.off = xoff;
+        split.len = xlen;
+        split.addr = xaddr;
+        if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
+                /* undo data extent allocation
+                 * NOTE(review): this frees (*xaddrp, *xlenp), i.e. the
+                 * caller's original values, while the extent allocated
+                 * just above is (xaddr, xlen), which may differ after
+                 * the index block loop and the min() - confirm.
+                 */
+                dbFree(ip, *xaddrp, (s64) * xlenp);
+                return rc;
+        }
+        *xaddrp = xaddr;
+        *xlenp = xlen;
+        return 0;
+        /*
+         *      insert the new entry into the leaf page
+         */
+      insertLeaf:
+        /*
+         * allocate data extent requested
+         */
+        if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
+                goto out;
+        BT_MARK_DIRTY(mp, ip);
+        /*
+         * acquire a transaction lock on the leaf page;
+         *
+         * action: xad insertion/extension;
+         */
+        tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+        xtlck = (struct xtlock *) & tlck->lock;
+        /* insert the new entry: mark the entry NEW */
+        xad = &p->xad[index];
+        XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+        /* advance next available entry index */
+        p->header.nextindex =
+            cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+        /* extend the logged range to start at the lowest modified index */
+        xtlck->lwm.offset =
+            (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
+        xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+            xtlck->lwm.offset;
+        *xaddrp = xaddr;
+        *xlenp = xlen;
+      out:
+        /* unpin the leaf page */
+        XT_PUTPAGE(mp);
+        return rc;
+}
+#ifdef _STILL_TO_PORT
+/* - TBD for defragmentation/reorganization -
+ *   (inside the #ifdef _STILL_TO_PORT region)
+ *
+ *      xtDelete()
+ *
+ * function:
+ *      delete the entry with the specified key.
+ *
+ *      N.B.: whole extent of the entry is assumed to be deleted.
+ *
+ * parameter:
+ *      tid     - transaction id (passed to txLock()/xtDeleteUp());
+ *      ip      - file object;
+ *      xoff    - key (extent offset) of the entry, looked up by xtSearch();
+ *      xlen    - not referenced in this function;
+ *      flag    - not referenced in this function;
+ *
+ * return:
+ *      0 on success;
+ *      -ENOENT if the entry is not found;
+ *      the non-zero rc of xtSearch();
+ *      the result of xtDeleteUp() when the leaf page becomes empty.
+ *
+ * exception:
+ */
+int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
+{
+        int rc = 0;
+        struct btstack btstack;
+        int cmp;
+        s64 bn;
+        struct metapage *mp;
+        xtpage_t *p;
+        int index, nextindex;
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        /*
+         * find the matching entry; xtSearch() pins the page
+         */
+        if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0)))
+                return rc;
+        XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+        if (cmp) {
+                /* unpin the leaf page */
+                XT_PUTPAGE(mp);
+                return -ENOENT;
+        }
+        /*
+         * delete the entry from the leaf page
+         *
+         * nextindex keeps the entry count from before the deletion;
+         * it is used below to size the compaction.
+         */
+        nextindex = le16_to_cpu(p->header.nextindex);
+        p->header.nextindex =
+            cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
+        /*
+         * if the leaf page becomes empty, free the page
+         */
+        if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
+                return (xtDeleteUp(tid, ip, mp, p, &btstack));
+        BT_MARK_DIRTY(mp, ip);
+        /*
+         * acquire a transaction lock on the leaf page;
+         *
+         * action:xad deletion;
+         */
+        tlck = txLock(tid, ip, mp, tlckXTREE);
+        xtlck = (struct xtlock *) & tlck->lock;
+        xtlck->lwm.offset =
+            (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
+        /* if delete from middle, shift left/compact the remaining entries */
+        if (index < nextindex - 1)
+                memmove(&p->xad[index], &p->xad[index + 1],
+                        (nextindex - index - 1) * sizeof(xad_t));
+        XT_PUTPAGE(mp);
+        return 0;
+}
+/* - TBD for defragmentation/reorganization -
+ *   (inside the #ifdef _STILL_TO_PORT region)
+ *
+ *      xtDeleteUp()
+ *
+ * function:
+ *      free empty pages as propagating deletion up the tree
+ *
+ * parameter:
+ *      tid     - transaction id (passed to xtRelink()/txLock());
+ *      ip      - file object;
+ *      fmp     - metapage of the page that has become empty;
+ *      fp      - the page that has become empty;
+ *      btstack - traverse stack; popped to visit the parent pages;
+ *
+ * return:
+ *      0 on success;
+ *      otherwise the non-zero rc of xtRelink() or XT_GETPAGE().
+ */
+static int
+xtDeleteUp(tid_t tid, struct inode *ip,
+           struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
+{
+        int rc = 0;
+        struct metapage *mp;
+        xtpage_t *p;
+        int index, nextindex;
+        s64 xaddr;
+        int xlen;
+        struct btframe *parent;
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        /*
+         * keep root leaf page which has become empty
+         */
+        if (fp->header.flag & BT_ROOT) {
+                /* keep the root page */
+                fp->header.flag &= ~BT_INTERNAL;
+                fp->header.flag |= BT_LEAF;
+                fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+                /* XT_PUTPAGE(fmp); */
+                return 0;
+        }
+        /*
+         * free non-root leaf page
+         */
+        if ((rc = xtRelink(tid, ip, fp))) {
+                XT_PUTPAGE(fmp);
+                return rc;
+        }
+        xaddr = addressPXD(&fp->header.self);
+        xlen = lengthPXD(&fp->header.self);
+        /* free the page extent */
+        dbFree(ip, xaddr, (s64) xlen);
+        /* free the buffer page */
+        discard_metapage(fmp);
+        /*
+         * propagate page deletion up the index tree
+         *
+         * If the delete from the parent page makes it empty,
+         * continue all the way up the tree.
+         * stop if the root page is reached (which is never deleted) or
+         * if the entry deletion does not empty the page.
+         */
+        while ((parent = BT_POP(btstack)) != NULL) {
+                /* get/pin the parent page <sp> */
+                XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                index = parent->index;
+                /* delete the entry for the freed child page from parent.
+                 */
+                nextindex = le16_to_cpu(p->header.nextindex);
+                /*
+                 * the parent has the single entry being deleted:
+                 * free the parent page which has become empty.
+                 *
+                 * NOTE(review): the test below compares nextindex with 1,
+                 * while an empty page is given nextindex XTENTRYSTART
+                 * elsewhere in this function - confirm the intended value.
+                 */
+                if (nextindex == 1) {
+                        if (p->header.flag & BT_ROOT) {
+                                /* keep the root page */
+                                p->header.flag &= ~BT_INTERNAL;
+                                p->header.flag |= BT_LEAF;
+                                p->header.nextindex =
+                                    cpu_to_le16(XTENTRYSTART);
+                                /* XT_PUTPAGE(mp); */
+                                break;
+                        } else {
+                                /* free the parent page
+                                 * NOTE(review): on xtRelink() failure this
+                                 * returns without unpinning mp, unlike the
+                                 * leaf case above - confirm.
+                                 */
+                                if ((rc = xtRelink(tid, ip, p)))
+                                        return rc;
+                                xaddr = addressPXD(&p->header.self);
+                                /* free the page extent */
+                                dbFree(ip, xaddr,
+                                       (s64) JFS_SBI(ip->i_sb)->nbperpage);
+                                /* unpin/free the buffer page */
+                                discard_metapage(mp);
+                                /* propagate up */
+                                continue;
+                        }
+                }
+                /*
+                 * the parent has other entries remaining:
+                 * delete the router entry from the parent page.
+                 */
+                else {
+                        BT_MARK_DIRTY(mp, ip);
+                        /*
+                         * acquire a transaction lock on the parent page;
+                         *
+                         * action:xad deletion;
+                         */
+                        tlck = txLock(tid, ip, mp, tlckXTREE);
+                        xtlck = (struct xtlock *) & tlck->lock;
+                        xtlck->lwm.offset =
+                            (xtlck->lwm.offset) ? min(index,
+                                                      xtlck->lwm.
+                                                      offset) : index;
+                        /* if delete from middle,
+                         * shift left/compact the remaining entries in the page
+                         */
+                        if (index < nextindex - 1)
+                                memmove(&p->xad[index], &p->xad[index + 1],
+                                        (nextindex - index -
+                                         1) << L2XTSLOTSIZE);
+                        p->header.nextindex =
+                            cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+                                        1);
+                        jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
+                                 (ulong) parent->bn, index);
+                }
+                /* unpin the parent page */
+                XT_PUTPAGE(mp);
+                /* exit propagation up */
+                break;
+        }
+        return 0;
+}
+/*
+ * NAME:        xtRelocate()
+ *
+ * FUNCTION:    relocate xtpage or data extent of regular file;
+ *              This function is mainly used by defragfs utility.
+ *              (inside the #ifdef _STILL_TO_PORT region)
+ *
+ * PARAMETERS:
+ *      tid     - transaction id;
+ *      ip      - file object;
+ *      oxad    - old XAD: offset, address and length of the source extent;
+ *      nxaddr  - new (destination) extent address;
+ *      xtype   - extent type, masked with EXTENT_TYPE: XTPAGE or DATAEXT;
+ *
+ * RETURN:      -ESTALE if the extent offset is at or beyond i_size, if the
+ *              search misses, or if the entry found does not match the
+ *              address/length of oxad;
+ *              the non-zero rc of xtSearch(), xtSearchNode() or
+ *              XT_GETPAGE();
+ *              otherwise rc as left by the last call that set it.
+ *
+ * NOTE:        This routine does not have the logic to handle
+ *              uncommitted allocated extent. The caller should call
+ *              txCommit() to commit all the allocation before call
+ *              this routine.
+ */
+int
+xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad,  /* old XAD */
+           s64 nxaddr,          /* new xaddr */
+           int xtype)
+{                               /* extent type: XTPAGE or DATAEXT */
+        int rc = 0;
+        struct tblock *tblk;
+        struct tlock *tlck;
+        struct xtlock *xtlck;
+        struct metapage *mp, *pmp, *lmp, *rmp;  /* meta-page buffer */
+        xtpage_t *p, *pp, *rp, *lp;     /* base B+-tree index page */
+        xad_t *xad;
+        pxd_t *pxd;
+        s64 xoff, xsize;
+        int xlen;
+        s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
+        cbuf_t *cp;
+        s64 offset, nbytes, nbrd, pno;
+        int nb, npages, nblks;
+        s64 bn;
+        int cmp;
+        int index;
+        struct pxd_lock *pxdlock;
+        struct btstack btstack; /* traverse stack */
+        xtype = xtype & EXTENT_TYPE;
+        xoff = offsetXAD(oxad);
+        oxaddr = addressXAD(oxad);
+        xlen = lengthXAD(oxad);
+        /* validate extent offset */
+        offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
+        if (offset >= ip->i_size)
+                return -ESTALE; /* stale extent */
+        jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
+                 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
+        /*
+         *      1. get and validate the parent xtpage/xad entry
+         *      covering the source extent to be relocated;
+         */
+        if (xtype == DATAEXT) {
+                /* search in leaf entry */
+                rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
+                if (rc)
+                        return rc;
+                /* retrieve search result */
+                XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+                if (cmp) {
+                        XT_PUTPAGE(pmp);
+                        return -ESTALE;
+                }
+                /* validate for exact match with a single entry */
+                xad = &pp->xad[index];
+                if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
+                        XT_PUTPAGE(pmp);
+                        return -ESTALE;
+                }
+        } else {                /* (xtype == XTPAGE) */
+                /* search in internal entry */
+                rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
+                if (rc)
+                        return rc;
+                /* retrieve search result */
+                XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+                if (cmp) {
+                        XT_PUTPAGE(pmp);
+                        return -ESTALE;
+                }
+                /* xtSearchNode() validated for exact match with a single entry
+                 */
+                xad = &pp->xad[index];
+        }
+        jfs_info("xtRelocate: parent xad entry validated.");
+        /*
+         *      2. relocate the extent
+         */
+        if (xtype == DATAEXT) {
+                /* if the extent is allocated-but-not-recorded
+                 * there is no real data to be moved in this extent,
+                 * so skip the copy; the parent page stays pinned and
+                 * is used again at step 4.
+                 */
+                if (xad->flag & XAD_NOTRECORDED)
+                        goto out;
+                else
+                        /* release xtpage for cmRead()/xtLookup() */
+                        XT_PUTPAGE(pmp);
+                /*
+                 *      cmRelocate()
+                 *
+                 * copy target data pages to be relocated;
+                 *
+                 * data extent must start at page boundary and
+                 * multiple of page size (except the last data extent);
+                 * read in each page of the source data extent into cbuf,
+                 * update the cbuf extent descriptor of the page to be
+                 * homeward bound to new dst data extent
+                 * copy the data from the old extent to new extent.
+                 * copy is essential for compressed files to avoid problems
+                 * that can arise if there was a change in compression
+                 * algorithms.
+                 * it is a good strategy because it may disrupt cache
+                 * policy to keep the pages in memory afterwards.
+                 */
+                offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
+                assert((offset & CM_OFFSET) == 0);
+                nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+                pno = offset >> CM_L2BSIZE;
+                npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
+/*
+                npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
+                         (offset >> CM_L2BSIZE) + 1;
+*/
+                sxaddr = oxaddr;
+                dxaddr = nxaddr;
+                /* process the request one cache buffer at a time */
+                for (nbrd = 0; nbrd < nbytes; nbrd += nb,
+                     offset += nb, pno++, npages--) {
+                        /* compute page size */
+                        nb = min(nbytes - nbrd, CM_BSIZE);
+                        /* get the cache buffer of the page
+                         * NOTE(review): on failure the loop is left with
+                         * rc set, but rc is then overwritten by the
+                         * xtSearch() call after the loop - confirm.
+                         */
+                        if (rc = cmRead(ip, offset, npages, &cp))
+                                break;
+                        assert(addressPXD(&cp->cm_pxd) == sxaddr);
+                        assert(!cp->cm_modified);
+                        /* bind buffer with the new extent address
+                         * NOTE(review): l2bsize is read through
+                         * JFS_IP(ip->i_sb) here but through
+                         * JFS_SBI(ip->i_sb) everywhere else in this
+                         * function - confirm.
+                         */
+                        nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
+                        cmSetXD(ip, cp, pno, dxaddr, nblks);
+                        /* release the cbuf, mark it as modified */
+                        cmPut(cp, TRUE);
+                        dxaddr += nblks;
+                        sxaddr += nblks;
+                }
+                /* get back parent page
+                 * NOTE(review): cmp is not checked after this second
+                 * search - confirm.
+                 */
+                if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0)))
+                        return rc;
+                XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+                jfs_info("xtRelocate: target data extent relocated.");
+        } else {                /* (xtype  == XTPAGE) */
+                /*
+                 * read in the target xtpage from the source extent;
+                 */
+                XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
+                if (rc) {
+                        XT_PUTPAGE(pmp);
+                        return rc;
+                }
+                /*
+                 * read in sibling pages if any to update sibling pointers;
+                 */
+                rmp = NULL;
+                if (p->header.next) {
+                        nextbn = le64_to_cpu(p->header.next);
+                        XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
+                        if (rc) {
+                                XT_PUTPAGE(pmp);
+                                XT_PUTPAGE(mp);
+                                return (rc);
+                        }
+                }
+                lmp = NULL;
+                if (p->header.prev) {
+                        prevbn = le64_to_cpu(p->header.prev);
+                        XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
+                        if (rc) {
+                                XT_PUTPAGE(pmp);
+                                XT_PUTPAGE(mp);
+                                if (rmp)
+                                        XT_PUTPAGE(rmp);
+                                return (rc);
+                        }
+                }
+                /* at this point, all xtpages to be updated are in memory */
+                /*
+                 * update sibling pointers of sibling xtpages if any;
+                 */
+                if (lmp) {
+                        BT_MARK_DIRTY(lmp, ip);
+                        tlck =
+                            txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
+                        lp->header.next = cpu_to_le64(nxaddr);
+                        XT_PUTPAGE(lmp);
+                }
+                if (rmp) {
+                        BT_MARK_DIRTY(rmp, ip);
+                        tlck =
+                            txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
+                        rp->header.prev = cpu_to_le64(nxaddr);
+                        XT_PUTPAGE(rmp);
+                }
+                /*
+                 * update the target xtpage to be relocated
+                 *
+                 * update the self address of the target page
+                 * and write to destination extent;
+                 * redo image covers the whole xtpage since it is new page
+                 * to the destination extent;
+                 * update of bmap for the free of source extent
+                 * of the target xtpage itself:
+                 * update of bmap for the allocation of destination extent
+                 * of the target xtpage itself:
+                 * update of bmap for the extents covered by xad entries in
+                 * the target xtpage is not necessary since they are not
+                 * updated;
+                 * if not committed before this relocation,
+                 * target page may contain XAD_NEW entries which must
+                 * be scanned for bmap update (logredo() always
+                 * scan xtpage REDOPAGE image for bmap update);
+                 * if committed before this relocation (tlckRELOCATE),
+                 * scan may be skipped by commit() and logredo();
+                 */
+                BT_MARK_DIRTY(mp, ip);
+                /* tlckNEW init  xtlck->lwm.offset = XTENTRYSTART; */
+                tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
+                xtlck = (struct xtlock *) & tlck->lock;
+                /* update the self address in the xtpage header */
+                pxd = &p->header.self;
+                PXDaddress(pxd, nxaddr);
+                /* linelock for the after image of the whole page */
+                xtlck->lwm.length =
+                    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+                /* update the buffer extent descriptor of target xtpage */
+                xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+                bmSetXD(mp, nxaddr, xsize);
+                /* unpin the target page to new homeward bound */
+                XT_PUTPAGE(mp);
+                jfs_info("xtRelocate: target xtpage relocated.");
+        }
+        /*
+         *      3. acquire maplock for the source extent to be freed;
+         *
+         * acquire a maplock saving the src relocated extent address;
+         * to free of the extent at commit time;
+         */
+      out:
+        /* if DATAEXT relocation, write a LOG_UPDATEMAP record for
+         * free PXD of the source data extent (logredo() will update
+         * bmap for free of source data extent), and update bmap for
+         * free of the source data extent;
+         */
+        if (xtype == DATAEXT)
+                tlck = txMaplock(tid, ip, tlckMAP);
+        /* if XTPAGE relocation, write a LOG_NOREDOPAGE record
+         * for the source xtpage (logredo() will init NoRedoPage
+         * filter and will also update bmap for free of the source
+         * xtpage), and update bmap for free of the source xtpage;
+         * N.B. We use tlckMAP instead of tlckXTREE because there
+         *      is no buffer associated with this lock since the buffer
+         *      has been redirected to the target location.
+         */
+        else                    /* (xtype  == XTPAGE) */
+                tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
+        /* record the source extent (oxaddr, xlen) as a PXD to be freed */
+        pxdlock = (struct pxd_lock *) & tlck->lock;
+        pxdlock->flag = mlckFREEPXD;
+        PXDaddress(&pxdlock->pxd, oxaddr);
+        PXDlength(&pxdlock->pxd, xlen);
+        pxdlock->index = 1;
+        /*
+         *      4. update the parent xad entry for relocation;
+         *
+         * acquire tlck for the parent entry with XAD_NEW as entry
+         * update which will write LOG_REDOPAGE and update bmap for
+         * allocation of XAD_NEW destination extent;
+         */
+        jfs_info("xtRelocate: update parent xad entry.");
+        BT_MARK_DIRTY(pmp, ip);
+        tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
+        xtlck = (struct xtlock *) & tlck->lock;
+        /* update the XAD with the new destination extent; */
+        xad = &pp->xad[index];
+        xad->flag |= XAD_NEW;
+        XADaddress(xad, nxaddr);
+        xtlck->lwm.offset = min(index, xtlck->lwm.offset);
+        xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
+            xtlck->lwm.offset;
+        /* unpin the parent xtpage */
+        XT_PUTPAGE(pmp);
+        return rc;
+}
+/*
+ *      xtSearchNode()
+ *
+ * function:    search for the internal xad entry covering specified extent.
+ *              This function is mainly used by defragfs utility.
+ *              (inside the #ifdef _STILL_TO_PORT region)
+ *
+ * parameters:
+ *      ip      - file object;
+ *      xad     - extent to find; its offset is the search key and its
+ *                address is used to confirm an exact match;
+ *      cmpp    - comparison result; written only on an exact match;
+ *      btstack - traverse stack;
+ *      flag    - search process flag; not referenced in this function;
+ *
+ * returns:
+ *      0 on an exact match:
+ *      btstack contains (bn, index) of search path traversed to the entry.
+ *      *cmpp is set to result of comparison with the entry returned.
+ *      the page containing the entry is pinned at exit.
+ *      -ESTALE if the descent reaches a leaf page without a match;
+ *      the non-zero rc of XT_GETPAGE() otherwise.
+ *
+ *      NOTE(review): only btstack->top is filled in here; no frame is
+ *      pushed while descending in this function - confirm against the
+ *      "search path" wording above.
+ */
+static int xtSearchNode(struct inode *ip, xad_t * xad,  /* required XAD entry */
+                        int *cmpp, struct btstack * btstack, int flag)
+{
+        int rc = 0;
+        s64 xoff, xaddr;
+        int xlen;               /* assigned below but not used afterwards */
+        int cmp = 1;            /* init for empty page */
+        s64 bn;                 /* block number */
+        struct metapage *mp;    /* meta-page buffer */
+        xtpage_t *p;            /* page */
+        int base, index, lim;
+        struct btframe *btsp;
+        s64 t64;
+        BT_CLR(btstack);
+        xoff = offsetXAD(xad);
+        xlen = lengthXAD(xad);
+        xaddr = addressXAD(xad);
+        /*
+         *      search down tree from root:
+         *
+         * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+         * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+         *
+         * if entry with search key K is not found
+         * internal page search find the entry with largest key Ki
+         * less than K which point to the child page to search;
+         * leaf page search find the entry with smallest key Kj
+         * greater than K so that the returned index is the position of
+         * the entry to be shifted right for insertion of new entry.
+         * for empty tree, search key is greater than any key of the tree.
+         *
+         * by convention, root bn = 0.
+         */
+        for (bn = 0;;) {
+                /* get/pin the page to search */
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /* only internal pages are searched: give up at a leaf */
+                if (p->header.flag & BT_LEAF) {
+                        XT_PUTPAGE(mp);
+                        return -ESTALE;
+                }
+                lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+                /*
+                 * binary search with search key K on the current page
+                 */
+                for (base = XTENTRYSTART; lim; lim >>= 1) {
+                        index = base + (lim >> 1);
+                        XT_CMP(cmp, xoff, &p->xad[index], t64);
+                        if (cmp == 0) {
+                                /*
+                                 *      search hit
+                                 *
+                                 * verify for exact match;
+                                 */
+                                if (xaddr == addressXAD(&p->xad[index]) &&
+                                    xoff == offsetXAD(&p->xad[index])) {
+                                        *cmpp = cmp;
+                                        /* save search result */
+                                        btsp = btstack->top;
+                                        btsp->bn = bn;
+                                        btsp->index = index;
+                                        btsp->mp = mp;
+                                        return 0;
+                                }
+                                /* descend/search its child page */
+                                goto next;
+                        }
+                        if (cmp > 0) {
+                                base = index + 1;
+                                --lim;
+                        }
+                }
+                /*
+                 *      search miss - non-leaf page:
+                 *
+                 * base is the smallest index with key (Kj) greater than
+                 * search key (K) and may be zero or maxentry index.
+                 * if base is non-zero, decrement base by one to get the parent
+                 * entry of the child page to search.
+                 */
+                index = base ? base - 1 : base;
+                /*
+                 * go down to child page
+                 */
+              next:
+                /* get the child page block number */
+                bn = addressXAD(&p->xad[index]);
+                /* unpin the parent page */
+                XT_PUTPAGE(mp);
+        }
+}
+/*
+ *      xtRelink()
+ *
+ * function:
+ *      link around a freed page: make the right sibling's prev pointer
+ *      and the left sibling's next pointer skip over the page.
+ *
+ * parameter:
+ *      tid_t           tid     - transaction used to tlock the siblings
+ *      struct inode    *ip     - inode owning the xtree
+ *      xtpage_t        *p      - page being unlinked; only header.next and
+ *                                header.prev are read from it
+ *
+ * return:
+ *      0 on success;
+ *      the error code set by XT_GETPAGE if a sibling page cannot be read.
+ *
+ * note:
+ *      a sibling block number of 0 is treated as "no sibling" and skipped.
+ *      header.next/header.prev are kept little-endian in the page, so every
+ *      store goes through cpu_to_le64().
+ */
+static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
+{
+        int rc = 0;
+        struct metapage *mp;
+        s64 nextbn, prevbn;
+        /* read both links up front: p is reused for the siblings below */
+        nextbn = le64_to_cpu(p->header.next);
+        prevbn = le64_to_cpu(p->header.prev);
+        /* update prev pointer of the next page */
+        if (nextbn != 0) {
+                XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * acquire a transaction lock on the page;
+                 *
+                 * action: update prev pointer;
+                 *
+                 * the page may already have been tlock'd; the returned
+                 * tlock is not needed here.
+                 */
+                BT_MARK_DIRTY(mp, ip);
+                txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+                p->header.prev = cpu_to_le64(prevbn);
+                XT_PUTPAGE(mp);
+        }
+        /* update next pointer of the previous page */
+        if (prevbn != 0) {
+                XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /*
+                 * acquire a transaction lock on the page;
+                 *
+                 * action: update next pointer;
+                 *
+                 * the page may already have been tlock'd; the returned
+                 * tlock is not needed here.
+                 */
+                BT_MARK_DIRTY(mp, ip);
+                txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+                /* CPU order -> on-disk order (was le64_to_cpu by mistake) */
+                p->header.next = cpu_to_le64(nextbn);
+                XT_PUTPAGE(mp);
+        }
+        return 0;
+}
+#endif                          /*  _STILL_TO_PORT */
+/*
+ *      xtInitRoot()
+ *
+ * function:
+ *      initialize the file's xtree root, which lives inline in the inode,
+ *      as an empty root leaf.
+ *
+ * parameter:
+ *      tid_t           tid     - transaction under which the root is tlocked
+ *      struct inode    *ip     - inode whose in-line root is initialized
+ *
+ * note:
+ *      directories get XTROOTINITSLOT_DIR slots and keep their i_size;
+ *      every other inode gets XTROOTINITSLOT slots and i_size reset to 0.
+ */
+void xtInitRoot(tid_t tid, struct inode *ip)
+{
+        xtpage_t *root;
+        /* tlock the in-inode root as a new xtree page */
+        txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
+                      tlckXTREE | tlckNEW);
+        root = &JFS_IP(ip)->i_xtroot;
+        /* empty root leaf: first free slot is XTENTRYSTART */
+        root->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
+        root->header.nextindex = cpu_to_le16(XTENTRYSTART);
+        if (!S_ISDIR(ip->i_mode)) {
+                /* non-directory: default slot count, and size starts at 0 */
+                root->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
+                ip->i_size = 0;
+                return;
+        }
+        /* directory: directory slot count, i_size left untouched */
+        root->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
+}
+/*
+ * We can run into a deadlock truncating a file with a large number of
+ * xtree pages (large fragmented file).  A robust fix would entail a
+ * reservation system where we would reserve a number of metadata pages
+ * and tlocks that we would be guaranteed to obtain without deadlocking.
+ * Without this, a partial fix is to limit the number of metadata pages
+ * we will lock in a single transaction.  Currently we truncate the file
+ * so that no more than MAX_TRUNCATE_LEAVES (50) leaf pages will be
+ * locked.  The caller of xtTruncate is responsible for ensuring that the
+ * current transaction gets committed, and that subsequent transactions
+ * are created to truncate the file further if needed.
+ */
+#define MAX_TRUNCATE_LEAVES 50
+/*
+ *      xtTruncate()
+ *
+ * function:
+ *      traverse for truncation logging backward bottom up;
+ *      terminate at the last extent entry at the current subtree
+ *      root page covering new down size.
+ *      truncation may occur within the last extent entry.
+ *
+ * parameter:
+ *      tid_t         tid,
+ *      struct inode    *ip,
+ *      s64           newsize,
+ *      int           flag)   {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
+ *
+ *      tid may be 0, in which case no tblock is looked up;
+ *      flag is or'ed into the tblock's xflag (when tid is non-zero) and
+ *      then masked with COMMIT_MAP; COMMIT_PMAP is rejected by assert.
+ *
+ * return:
+ *      the size the file was truncated to.  This is newsize, unless the
+ *      MAX_TRUNCATE_LEAVES limit was hit in the COMMIT_PWMAP case: then
+ *      newsize is replaced by the byte offset of the end of the last
+ *      extent of the leaf that was not processed, and the caller must
+ *      commit and truncate again.
+ *      if XT_GETPAGE fails, its error code is returned instead.
+ *
+ * note:
+ *      PWMAP:
+ *       1. truncate (non-COMMIT_NOLINK file)
+ *          by jfs_truncate() or jfs_open(O_TRUNC):
+ *          xtree is updated;
+ *       2. truncate index table of directory when last entry removed
+ *       map update via tlock at commit time;
+ *      PMAP:
+ *       Call xtTruncate_pmap instead
+ *      WMAP:
+ *       1. remove (free zero link count) on last reference release
+ *          (pmap has been freed at commit zero link count);
+ *       2. truncate (COMMIT_NOLINK file, i.e., tmp file):
+ *          xtree is updated;
+ *       map update directly at truncation time;
+ *
+ *      if (DELETE)
+ *              no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
+ *      else if (TRUNCATE)
+ *              must write LOG_NOREDOPAGE for deleted index page;
+ *
+ * pages may already have been tlocked by anonymous transactions
+ * during file growth (i.e., write) before truncation;
+ *
+ * except for the last truncated entry, deleted entries remain as they are
+ * in the page (only nextindex is updated) for other use
+ * (e.g., log/update allocation map): this avoids copying the page
+ * info but delays the freeing of pages;
+ *
+ * on exit i_size is set to the returned size (1 for a directory
+ * truncated to zero) and the freed block count is given back to quota.
+ */
+s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
+{
+        int rc = 0;
+        s64 teof;
+        struct metapage *mp;
+        xtpage_t *p;
+        s64 bn;
+        int index, nextindex;
+        xad_t *xad;
+        s64 xoff, xaddr;
+        int xlen, len, freexlen;
+        struct btstack btstack;
+        struct btframe *parent;
+        struct tblock *tblk = NULL;
+        struct tlock *tlck = NULL;
+        struct xtlock *xtlck = NULL;
+        struct xdlistlock xadlock;      /* maplock for COMMIT_WMAP */
+        struct pxd_lock *pxdlock;               /* maplock for COMMIT_WMAP */
+        s64 nfreed;
+        int freed, log;
+        int locked_leaves = 0;
+        /* save object truncation type */
+        if (tid) {
+                tblk = tid_to_tblock(tid);
+                tblk->xflag |= flag;
+        }
+        nfreed = 0;
+        flag &= COMMIT_MAP;
+        assert(flag != COMMIT_PMAP);
+        if (flag == COMMIT_PWMAP)
+                log = 1;
+        else {
+                log = 0;
+                xadlock.flag = mlckFREEXADLIST;
+                xadlock.index = 1;
+        }
+        /*
+         * if the newsize is not an integral number of pages,
+         * the file between newsize and next page boundary will
+         * be cleared.
+         * if truncating into a file hole, it will cause
+         * a full block to be allocated for the logical block.
+         */
+        /*
+         * release page blocks of truncated region <teof, eof>
+         *
+         * free the data blocks from the leaf index blocks.
+         * delete the parent index entries corresponding to
+         * the freed child data/index blocks.
+         * free the index blocks themselves which aren't needed
+         * in new sized file.
+         *
+         * index blocks are updated only if the blocks are to be
+         * retained in the new sized file.
+         * if type is PMAP, the data and index pages are NOT
+         * freed, and the data and index blocks are NOT freed
+         * from  working map.
+         * (this will allow continued access of data/index of
+         * temporary file (zerolink count file truncated to zero-length)).
+         *
+         * teof is newsize rounded up to a whole number of blocks,
+         * expressed in blocks.
+         */
+        teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+            JFS_SBI(ip->i_sb)->l2bsize;
+        /* clear stack */
+        BT_CLR(&btstack);
+        /*
+         * start with root
+         *
+         * root resides in the inode
+         */
+        bn = 0;
+        /*
+         * first access of each page:
+         */
+      getPage:
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /* process entries backward from last index */
+        index = le16_to_cpu(p->header.nextindex) - 1;
+        if (p->header.flag & BT_INTERNAL)
+                goto getChild;
+        /*
+         *      leaf page
+         */
+        /* Since this is the rightmost leaf, and we may have already freed
+         * a page that was formerly to the right, let's make sure that the
+         * next pointer is zero.
+         */
+        if (p->header.next) {
+                if (log)
+                        /*
+                         * Make sure this change to the header is logged.
+                         * If we really truncate this leaf, the flag
+                         * will be changed to tlckTRUNCATE
+                         */
+                        tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                BT_MARK_DIRTY(mp, ip);
+                p->header.next = 0;
+        }
+        freed = 0;      /* set to 1 below only if this leaf becomes empty */
+        /* does region covered by leaf page precede Teof ? */
+        xad = &p->xad[index];
+        xoff = offsetXAD(xad);
+        xlen = lengthXAD(xad);
+        if (teof >= xoff + xlen) {
+                XT_PUTPAGE(mp);
+                goto getParent;
+        }
+        /* (re)acquire tlock of the leaf page */
+        if (log) {
+                if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
+                        /*
+                         * We need to limit the size of the transaction
+                         * to avoid exhausting pagecache & tlocks
+                         *
+                         * stop here: report the end of this leaf's last
+                         * extent as the size reached, leave the leaf
+                         * untouched and unwind through the parents.
+                         */
+                        XT_PUTPAGE(mp);
+                        newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
+                        goto getParent;
+                }
+                tlck = txLock(tid, ip, mp, tlckXTREE);
+                tlck->type = tlckXTREE | tlckTRUNCATE;
+                xtlck = (struct xtlock *) & tlck->lock;
+                xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
+        }
+        BT_MARK_DIRTY(mp, ip);
+        /*
+         * scan backward leaf page entries
+         */
+        for (; index >= XTENTRYSTART; index--) {
+                xad = &p->xad[index];
+                xoff = offsetXAD(xad);
+                xlen = lengthXAD(xad);
+                xaddr = addressXAD(xad);
+                /*
+                 * The "data" for a directory is indexed by the block
+                 * device's address space.  This metadata must be invalidated
+                 * here
+                 */
+                if (S_ISDIR(ip->i_mode) && (teof == 0))
+                        invalidate_xad_metapages(ip, *xad);
+                /*
+                 * entry beyond eof: continue scan of current page
+                 *          xad
+                 * ---|---=======------->
+                 *   eof
+                 */
+                if (teof < xoff) {
+                        nfreed += xlen;
+                        continue;
+                }
+                /*
+                 * (xoff <= teof): last entry to be deleted from page;
+                 * If other entries remain in page: keep and update the page.
+                 */
+                /*
+                 * eof == entry_start: delete the entry
+                 *           xad
+                 * -------|=======------->
+                 *       eof
+                 *
+                 * if it is the first entry of the page, the whole page
+                 * is now empty: leave the loop to free the page.
+                 */
+                if (teof == xoff) {
+                        nfreed += xlen;
+                        if (index == XTENTRYSTART)
+                                break;
+                        nextindex = index;
+                }
+                /*
+                 * eof within the entry: truncate the entry.
+                 *          xad
+                 * -------===|===------->
+                 *          eof
+                 */
+                else if (teof < xoff + xlen) {
+                        /* update truncated entry */
+                        len = teof - xoff;
+                        freexlen = xlen - len;
+                        XADlength(xad, len);
+                        /* save pxd of truncated extent in tlck */
+                        xaddr += len;
+                        if (log) {      /* COMMIT_PWMAP */
+                                xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                                    min(index, (int)xtlck->lwm.offset) : index;
+                                xtlck->lwm.length = index + 1 -
+                                    xtlck->lwm.offset;
+                                xtlck->twm.offset = index;
+                                pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
+                                pxdlock->flag = mlckFREEPXD;
+                                PXDaddress(&pxdlock->pxd, xaddr);
+                                PXDlength(&pxdlock->pxd, freexlen);
+                        }
+                        /* free truncated extent */
+                        else {  /* COMMIT_WMAP */
+                                pxdlock = (struct pxd_lock *) & xadlock;
+                                pxdlock->flag = mlckFREEPXD;
+                                PXDaddress(&pxdlock->pxd, xaddr);
+                                PXDlength(&pxdlock->pxd, freexlen);
+                                txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+                                /* reset map lock */
+                                xadlock.flag = mlckFREEXADLIST;
+                        }
+                        /* current entry is new last entry; */
+                        nextindex = index + 1;
+                        nfreed += freexlen;
+                }
+                /*
+                 * eof beyond the entry:
+                 *          xad
+                 * -------=======---|--->
+                 *                 eof
+                 */
+                else {          /* (xoff + xlen < teof) */
+                        nextindex = index + 1;
+                }
+                if (nextindex < le16_to_cpu(p->header.nextindex)) {
+                        if (!log) {     /* COMMIT_WMAP */
+                                xadlock.xdlist = &p->xad[nextindex];
+                                xadlock.count =
+                                    le16_to_cpu(p->header.nextindex) -
+                                    nextindex;
+                                txFreeMap(ip, (struct maplock *) & xadlock,
+                                          NULL, COMMIT_WMAP);
+                        }
+                        p->header.nextindex = cpu_to_le16(nextindex);
+                }
+                XT_PUTPAGE(mp);
+                /* assert(freed == 0); */
+                goto getParent;
+        }                       /* end scan of leaf page entries */
+        freed = 1;      /* every entry of the leaf was beyond/at teof */
+        /*
+         * leaf page become empty: free the page if type != PMAP
+         */
+        if (log) {              /* COMMIT_PWMAP */
+                /* txCommit() with tlckFREE:
+                 * free data extents covered by leaf [XTENTRYSTART:hwm);
+                 * invalidate leaf if COMMIT_PWMAP;
+                 * if (TRUNCATE), will write LOG_NOREDOPAGE;
+                 */
+                tlck->type = tlckXTREE | tlckFREE;
+        } else {                /* COMMIT_WMAP */
+                /* free data extents covered by leaf */
+                xadlock.xdlist = &p->xad[XTENTRYSTART];
+                xadlock.count =
+                    le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+                txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP);
+        }
+        if (p->header.flag & BT_ROOT) {
+                p->header.flag &= ~BT_INTERNAL;
+                p->header.flag |= BT_LEAF;
+                p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+                XT_PUTPAGE(mp); /* debug */
+                goto out;
+        } else {
+                if (log) {      /* COMMIT_PWMAP */
+                        /* page will be invalidated at tx completion
+                         */
+                        XT_PUTPAGE(mp);
+                } else {        /* COMMIT_WMAP */
+                        if (mp->lid)
+                                lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;
+                        /* invalidate empty leaf page */
+                        discard_metapage(mp);
+                }
+        }
+        /*
+         * the leaf page become empty: delete the parent entry
+         * for the leaf page if the parent page is to be kept
+         * in the new sized file.
+         */
+        /*
+         * go back up to the parent page
+         */
+      getParent:
+        /* pop/restore parent entry for the current child page */
+        if ((parent = BT_POP(&btstack)) == NULL)
+                /* current page must have been root */
+                goto out;
+        /* get back the parent page */
+        bn = parent->bn;
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        index = parent->index;
+        /*
+         * child page was not empty:
+         *
+         * the walk is finished; drop the parent entries to the right of
+         * the child (if any) and keep unwinding to the root.
+         */
+        if (freed == 0) {
+                /* has any entry deleted from parent ? */
+                if (index < le16_to_cpu(p->header.nextindex) - 1) {
+                        /* (re)acquire tlock on the parent page */
+                        if (log) {      /* COMMIT_PWMAP */
+                                /* txCommit() with tlckTRUNCATE:
+                                 * free child extents covered by parent [);
+                                 */
+                                tlck = txLock(tid, ip, mp, tlckXTREE);
+                                xtlck = (struct xtlock *) & tlck->lock;
+                                if (!(tlck->type & tlckTRUNCATE)) {
+                                        xtlck->hwm.offset =
+                                            le16_to_cpu(p->header.
+                                                        nextindex) - 1;
+                                        tlck->type =
+                                            tlckXTREE | tlckTRUNCATE;
+                                }
+                        } else {        /* COMMIT_WMAP */
+                                /* free child extents covered by parent */
+                                xadlock.xdlist = &p->xad[index + 1];
+                                xadlock.count =
+                                    le16_to_cpu(p->header.nextindex) -
+                                    index - 1;
+                                txFreeMap(ip, (struct maplock *) & xadlock,
+                                          NULL, COMMIT_WMAP);
+                        }
+                        BT_MARK_DIRTY(mp, ip);
+                        p->header.nextindex = cpu_to_le16(index + 1);
+                }
+                XT_PUTPAGE(mp);
+                goto getParent;
+        }
+        /*
+         * child page was empty:
+         *
+         * count the child index page itself among the freed blocks.
+         */
+        nfreed += lengthXAD(&p->xad[index]);
+        /*
+         * During working map update, child page's tlock must be handled
+         * before parent's.  This is because the parent's tlock will cause
+         * the child's disk space to be marked available in the wmap, so
+         * it's important that the child page be released by that time.
+         *
+         * ToDo:  tlocks should be on doubly-linked list, so we can
+         * quickly remove it and add it to the end.
+         */
+        /*
+         * Move parent page's tlock to the end of the tid's tlock list
+         *
+         * (singly-linked list: walk from tblk->next to find the
+         * predecessor, unlink, then append after tblk->last)
+         */
+        if (log && mp->lid && (tblk->last != mp->lid) &&
+            lid_to_tlock(mp->lid)->tid) {
+                lid_t lid = mp->lid;
+                struct tlock *prev;
+                tlck = lid_to_tlock(lid);
+                if (tblk->next == lid)
+                        tblk->next = tlck->next;
+                else {
+                        for (prev = lid_to_tlock(tblk->next);
+                             prev->next != lid;
+                             prev = lid_to_tlock(prev->next)) {
+                                assert(prev->next);
+                        }
+                        prev->next = tlck->next;
+                }
+                lid_to_tlock(tblk->last)->next = lid;
+                tlck->next = 0;
+                tblk->last = lid;
+        }
+        /*
+         * parent page become empty: free the page
+         */
+        if (index == XTENTRYSTART) {
+                if (log) {      /* COMMIT_PWMAP */
+                        /* txCommit() with tlckFREE:
+                         * free child extents covered by parent;
+                         * invalidate parent if COMMIT_PWMAP;
+                         */
+                        tlck = txLock(tid, ip, mp, tlckXTREE);
+                        xtlck = (struct xtlock *) & tlck->lock;
+                        xtlck->hwm.offset =
+                            le16_to_cpu(p->header.nextindex) - 1;
+                        tlck->type = tlckXTREE | tlckFREE;
+                } else {        /* COMMIT_WMAP */
+                        /* free child extents covered by parent */
+                        xadlock.xdlist = &p->xad[XTENTRYSTART];
+                        xadlock.count =
+                            le16_to_cpu(p->header.nextindex) -
+                            XTENTRYSTART;
+                        txFreeMap(ip, (struct maplock *) & xadlock, NULL,
+                                  COMMIT_WMAP);
+                }
+                BT_MARK_DIRTY(mp, ip);
+                if (p->header.flag & BT_ROOT) {
+                        p->header.flag &= ~BT_INTERNAL;
+                        p->header.flag |= BT_LEAF;
+                        p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+                        if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
+                                /*
+                                 * Shrink root down to allow inline
+                                 * EA (otherwise fsck complains)
+                                 */
+                                p->header.maxentry =
+                                    cpu_to_le16(XTROOTINITSLOT);
+                                JFS_IP(ip)->mode2 |= INLINEEA;
+                        }
+                        XT_PUTPAGE(mp); /* debug */
+                        goto out;
+                } else {
+                        if (log) {      /* COMMIT_PWMAP */
+                                /* page will be invalidated at tx completion
+                                 */
+                                XT_PUTPAGE(mp);
+                        } else {        /* COMMIT_WMAP */
+                                if (mp->lid)
+                                        lid_to_tlock(mp->lid)->flag |=
+                                                tlckFREELOCK;
+                                /* invalidate parent page */
+                                discard_metapage(mp);
+                        }
+                        /* parent has become empty and freed:
+                         * go back up to its parent page
+                         */
+                        /* freed = 1; */
+                        goto getParent;
+                }
+        }
+        /*
+         * parent page still has entries for front region;
+         */
+        else {
+                /* try truncate region covered by preceding entry
+                 * (process backward)
+                 */
+                index--;
+                /* go back down to the child page corresponding
+                 * to the entry
+                 */
+                goto getChild;
+        }
+        /*
+         *      internal page: go down to child page of current entry
+         */
+      getChild:
+        /* save current parent entry for the child page */
+        BT_PUSH(&btstack, bn, index);
+        /* get child page */
+        xad = &p->xad[index];
+        bn = addressXAD(xad);
+        /*
+         * first access of each internal entry:
+         */
+        /* release parent page */
+        XT_PUTPAGE(mp);
+        /* process the child page */
+        goto getPage;
+      out:
+        /*
+         * update file resource stat
+         */
+        /* set size
+         */
+        if (S_ISDIR(ip->i_mode) && !newsize)
+                ip->i_size = 1; /* fsck hates zero-length directories */
+        else
+                ip->i_size = newsize;
+        /* update quota allocation to reflect freed blocks */
+        DQUOT_FREE_BLOCK(ip, nfreed);
+        /*
+         * free tlock of invalidated pages
+         */
+        if (flag == COMMIT_WMAP)
+                txFreelock(ip);
+        return newsize;
+}
+/*
+ *      xtTruncate_pmap()
+ *
+ * function:
+ *      Perform truncate to zero length for deleted file, leaving the
+ *      xtree and working map untouched.  This allows the file to
+ *      be accessed via open file handles, while the delete of the file
+ *      is committed to disk.
+ *
+ * parameter:
+ *      tid_t           tid,
+ *      struct inode    *ip,
+ *      s64             committed_size)
+ *
+ *      committed_size is 0 on the first call (start from the rightmost
+ *      leaf); otherwise it is the value returned by the previous call and
+ *      the walk resumes at the leaf holding the block just below it.
+ *
+ * return:
+ *      0 when the whole tree has been tlocked for freeing;
+ *      a positive new committed size (in bytes) when the
+ *      MAX_TRUNCATE_LEAVES limit was reached and another transaction
+ *      is needed;
+ *      a negative error code: the one from xtSearch/XT_GETPAGE, or -EIO
+ *      if the extent for committed_size is not found.
+ *
+ * note:
+ *
+ *      To avoid deadlock by holding too many transaction locks, the
+ *      truncation may be broken up into multiple transactions.
+ *      The committed_size keeps track of which part of the file has been
+ *      freed from the pmaps.
+ */
+s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
+{
+        s64 bn;
+        struct btstack btstack;
+        int cmp;
+        int index;
+        int is_root;
+        int locked_leaves = 0;
+        struct metapage *mp;
+        xtpage_t *p;
+        struct btframe *parent;
+        int rc;
+        struct tblock *tblk;
+        struct tlock *tlck = NULL;
+        xad_t *xad;
+        int xlen;
+        s64 xoff;
+        struct xtlock *xtlck = NULL;
+        /* save object truncation type */
+        tblk = tid_to_tblock(tid);
+        tblk->xflag |= COMMIT_PMAP;
+        /* clear stack */
+        BT_CLR(&btstack);
+        if (committed_size) {
+                /*
+                 * resume: locate the leaf entry covering the last block
+                 * below committed_size; xtSearch also rebuilds btstack
+                 */
+                xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
+                rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
+                if (rc)
+                        return rc;
+                XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+                if (cmp != 0) {
+                        XT_PUTPAGE(mp);
+                        jfs_error(ip->i_sb,
+                                  "xtTruncate_pmap: did not find extent");
+                        return -EIO;
+                }
+        } else {
+                /*
+                 * start with root
+                 *
+                 * root resides in the inode
+                 */
+                bn = 0;
+                /*
+                 * first access of each page:
+                 */
+      getPage:
+                XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                if (rc)
+                        return rc;
+                /* process entries backward from last index */
+                index = le16_to_cpu(p->header.nextindex) - 1;
+                if (p->header.flag & BT_INTERNAL)
+                        goto getChild;
+        }
+        /*
+         *      leaf page
+         */
+        if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
+                /*
+                 * We need to limit the size of the transaction
+                 * to avoid exhausting pagecache & tlocks
+                 *
+                 * report the end of this leaf's current entry as the new
+                 * committed size so the next call resumes here
+                 */
+                xad = &p->xad[index];
+                xoff = offsetXAD(xad);
+                xlen = lengthXAD(xad);
+                XT_PUTPAGE(mp);
+                return  (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
+        }
+        /* tlock the leaf for freeing entries [XTENTRYSTART:index] */
+        tlck = txLock(tid, ip, mp, tlckXTREE);
+        tlck->type = tlckXTREE | tlckFREE;
+        xtlck = (struct xtlock *) & tlck->lock;
+        xtlck->hwm.offset = index;
+        XT_PUTPAGE(mp);
+        /*
+         * go back up to the parent page
+         */
+      getParent:
+        /* pop/restore parent entry for the current child page */
+        if ((parent = BT_POP(&btstack)) == NULL)
+                /* current page must have been root */
+                goto out;
+        /* get back the parent page */
+        bn = parent->bn;
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        index = parent->index;
+        /*
+         * parent page become empty: free the page
+         */
+        if (index == XTENTRYSTART) {
+                /* txCommit() with tlckFREE:
+                 * free child extents covered by parent;
+                 * invalidate parent if COMMIT_PWMAP;
+                 */
+                tlck = txLock(tid, ip, mp, tlckXTREE);
+                xtlck = (struct xtlock *) & tlck->lock;
+                xtlck->hwm.offset =
+                    le16_to_cpu(p->header.nextindex) - 1;
+                tlck->type = tlckXTREE | tlckFREE;
+                /*
+                 * read the root flag while the page is still pinned;
+                 * p must not be dereferenced after XT_PUTPAGE
+                 */
+                is_root = p->header.flag & BT_ROOT;
+                XT_PUTPAGE(mp);
+                if (is_root)
+                        goto out;
+                goto getParent;
+        }
+        /*
+         * parent page still has entries for front region;
+         */
+        else
+                index--;
+        /*
+         *      internal page: go down to child page of current entry
+         */
+      getChild:
+        /* save current parent entry for the child page */
+        BT_PUSH(&btstack, bn, index);
+        /* get child page */
+        xad = &p->xad[index];
+        bn = addressXAD(xad);
+        /*
+         * first access of each internal entry:
+         */
+        /* release parent page */
+        XT_PUTPAGE(mp);
+        /* process the child page */
+        goto getPage;
+      out:
+        return 0;
+}
+#ifdef _JFS_DEBUG_XTREE
+/*
+ *      xtDisplayTree()
+ *
+ * function: debug aid; traverse the xtree of <ip> forward, depth first,
+ *      printing every leaf page with xtDisplayPage() and every descent
+ *      from an internal page to one of its children.
+ *
+ * parameter:
+ *      ip      - inode whose xtree is displayed
+ *
+ * return: 0 once the whole tree has been visited;
+ *      rc as set by XT_GETPAGE() if a page could not be obtained.
+ */
+int xtDisplayTree(struct inode *ip)
+{
+        int rc = 0;
+        struct metapage *mp;
+        xtpage_t *p;
+        s64 bn, pbn;
+        int index, lastindex, v, h;
+        xad_t *xad;
+        struct btstack btstack;
+        struct btframe *btsp;
+        struct btframe *parent;
+        printk("display B+-tree.\n");
+        /* clear stack: btsp always points at the next free frame */
+        btsp = btstack.stack;
+        /*
+         * start with root
+         *
+         * root resides in the inode
+         */
+        bn = 0;
+        v = h = 0;
+        /*
+         * first access of each page:
+         */
+      getPage:
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /* process entries forward from first index */
+        index = XTENTRYSTART;
+        lastindex = le16_to_cpu(p->header.nextindex) - 1;
+        if (p->header.flag & BT_INTERNAL) {
+                /*
+                 * first access of each internal page
+                 */
+                goto getChild;
+        } else {                /* (p->header.flag & BT_LEAF) */
+                /*
+                 * first access of each leaf page
+                 */
+                /* printk, not printf: consistent with the rest of this
+                 * function
+                 */
+                printk("leaf page ");
+                xtDisplayPage(ip, bn, p);
+                /* unpin the leaf page */
+                XT_PUTPAGE(mp);
+        }
+        /*
+         * go back up to the parent page
+         */
+      getParent:
+        /* pop/restore parent entry for the current child page */
+        if (btsp == btstack.stack)
+                /* stack empty: current page must have been root.
+                 * Return an explicit 0; this function returns int.
+                 */
+                return 0;
+        parent = --btsp;
+        /*
+         * parent page scan completed
+         */
+        index = parent->index;
+        lastindex = parent->lastindex;
+        if (index == lastindex) {
+                /* go back up to the parent page */
+                goto getParent;
+        }
+        /*
+         * parent page has entries remaining
+         */
+        /* get back the parent page */
+        bn = parent->bn;
+        /* v = parent->level; */
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /* get next parent entry */
+        index++;
+        /*
+         * internal page: go down to child page of current entry
+         */
+      getChild:
+        /* push/save current parent entry for the child page */
+        btsp->bn = pbn = bn;
+        btsp->index = index;
+        btsp->lastindex = lastindex;
+        /* btsp->level = v; */
+        /* btsp->node = h; */
+        ++btsp;
+        /* get child page */
+        xad = &p->xad[index];
+        bn = addressXAD(xad);
+        /*
+         * first access of each internal entry:
+         */
+        /* release parent page */
+        XT_PUTPAGE(mp);
+        printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index,
+               (ulong) bn);
+        v++;
+        h = index;
+        /* process the child page */
+        goto getPage;
+}
+/*
+ *      xtDisplayPage()
+ *
+ * function: debug aid; print the header fields and every xad entry of
+ *      one xtree page, four entries per output line.
+ *
+ * parameter:
+ *      ip      - inode owning the page (not referenced here)
+ *      bn      - block number printed to identify the page
+ *      p       - the xtree page to display
+ *
+ * return: 0 (always)
+ */
+int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p)
+{
+        xad_t *xad;
+        s64 xaddr, xoff;
+        int xlen, i, j;
+        /* display page control */
+        printk("bn:0x%lx flag:0x%x nextindex:%d\n",
+               (ulong) bn, p->header.flag,
+               le16_to_cpu(p->header.nextindex));
+        /* display entries; j counts entries on the current output line */
+        xad = &p->xad[XTENTRYSTART];
+        for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex);
+             i++, xad++, j++) {
+                xoff = offsetXAD(xad);
+                xaddr = addressXAD(xad);
+                xlen = lengthXAD(xad);
+                printk("\t[%d] 0x%lx:0x%lx(0x%x)", i, (ulong) xoff,
+                       (ulong) xaddr, xlen);
+                if (j == 4) {
+                        /* line full: break it and restart the count */
+                        printk("\n");
+                        j = 0;
+                }
+        }
+        printk("\n");
+        /* declared int: return a defined value instead of falling off
+         * the end of the function
+         */
+        return 0;
+}
+#endif                          /* _JFS_DEBUG_XTREE */
+#ifdef _JFS_WIP
+/*
+ *      xtGather()
+ *
+ * function:
+ *      traverse for allocation acquiring tlock at commit time
+ *      (vs at the time of update) logging backward top down
+ *
+ *      As written, the routine only performs the traversal: it walks
+ *      the tree depth first, visiting the entries of each page from
+ *      the last index down to XTENTRYSTART.  The logging steps exist
+ *      only as pseudo-code inside comments.
+ *
+ * return: 0 when the root frame has been popped (scan complete);
+ *      rc from XT_GETPAGE() on failure (-EIO when re-reading a parent).
+ *
+ * note:
+ *      problem - establishing that all new allocation have been
+ *      processed both for append and random write in sparse file
+ *      at the current entry at the current subtree root page
+ *
+ *      NOTE(review): work in progress (built only under _JFS_WIP).
+ *      The body uses 'ip' and 'mp', which are neither parameters nor
+ *      locals of this function, and the label 'out' is never the
+ *      target of a goto.
+ */
+int xtGather(btree_t *t)
+{
+        int rc = 0;
+        xtpage_t *p;
+        u64 bn;
+        int index;
+        btentry_t *e;
+        struct btstack btstack;
+        struct btsf *parent;
+        /* clear stack */
+        BT_CLR(&btstack);
+        /*
+         * start with root
+         *
+         * root resides in the inode
+         */
+        bn = 0;
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /* new root is NOT pointed by a new entry
+           if (p->header.flag & NEW)
+           allocate new page lock;
+           write a NEWPAGE log;
+         */
+      dopage:
+        /*
+         * first access of each page:
+         */
+        /* process entries backward from last index */
+        index = le16_to_cpu(p->header.nextindex) - 1;
+        if (p->header.flag & BT_LEAF) {
+                /*
+                 * first access of each leaf page
+                 */
+                /* process leaf page entries backward */
+                for (; index >= XTENTRYSTART; index--) {
+                        e = &p->xad[index];
+                        /*
+                         * if newpage, log NEWPAGE.
+                         *
+                         if (e->flag & XAD_NEW) {
+                         nfound =+ entry->length;
+                         update current page lock for the entry;
+                         newpage(entry);
+                         *
+                         * if moved, log move.
+                         *
+                         } else if (e->flag & XAD_MOVED) {
+                         reset flag;
+                         update current page lock for the entry;
+                         }
+                         */
+                }
+                /* unpin the leaf page */
+                XT_PUTPAGE(mp);
+                /*
+                 * go back up to the parent page
+                 */
+              getParent:
+                /* restore parent entry for the current child page */
+                if ((parent = BT_POP(&btstack)) == NULL)
+                        /* current page must have been root */
+                        return 0;
+                if ((index = parent->index) == XTENTRYSTART) {
+                        /*
+                         * parent page scan completed
+                         */
+                        /* go back up to the parent page */
+                        goto getParent;
+                } else {
+                        /*
+                         * parent page has entries remaining
+                         */
+                        /* get back the parent page */
+                        bn = parent->bn;
+                        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                        if (rc)
+                                return -EIO;
+                        /* first subroot page which
+                         * covers all new allocated blocks
+                         * itself not new/modified.
+                         * (if modified from split of descendent,
+                         * go down path of split page)
+                         if (nfound == nnew &&
+                         !(p->header.flag & (NEW | MOD)))
+                         exit scan;
+                         */
+                        /* process parent page entries backward */
+                        index--;
+                }
+        } else {
+                /*
+                 * first access of each internal page
+                 */
+        }
+        /*
+         * internal page: go down to child page of current entry
+         *
+         * reached either on first access of an internal page (index is
+         * its last entry) or after popping a parent (index is the entry
+         * before the one just finished)
+         */
+        /* save current parent entry for the child page */
+        BT_PUSH(&btstack, bn, index);
+        /* get current entry for the child page */
+        e = &p->xad[index];
+        /*
+         * first access of each internal entry:
+         */
+        /*
+         * if new entry, log btree_tnewentry.
+         *
+         if (e->flag & XAD_NEW)
+         update parent page lock for the entry;
+         */
+        /* release parent page */
+        XT_PUTPAGE(mp);
+        /* get child page */
+        bn = e->bn;
+        XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+        if (rc)
+                return rc;
+        /*
+         * first access of each non-root page:
+         */
+        /*
+         * if new, log btree_newpage.
+         *
+         if (p->header.flag & NEW)
+         allocate new page lock;
+         write a NEWPAGE log (next, prev);
+         */
+        /* process the child page */
+        goto dopage;
+        /* NOTE(review): no goto targets this label */
+      out:
+        return 0;
+}
+#endif                          /* _JFS_WIP */
+#ifdef CONFIG_JFS_STATISTICS
+/*
+ *      jfs_xtstat_read()
+ *
+ * function: format the xtree statistics counters (xtStat) into <buffer>
+ *      and report the window of that text selected by <offset>/<length>.
+ *
+ * parameter:
+ *      buffer  - output buffer the whole report is written into
+ *      start   - set to buffer + offset
+ *      offset  - position within the report the caller wants
+ *      length  - maximum number of bytes the caller wants
+ *      eof     - set to 1 when the remaining text fits in <length>
+ *      data    - not referenced
+ *
+ * return: number of bytes available at *start, clamped to [0, length]
+ */
+int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
+                    int *eof, void *data)
+{
+        int avail;
+        avail = sprintf(buffer,
+                        "JFS Xtree statistics\n"
+                        "====================\n"
+                        "searches = %d\n"
+                        "fast searches = %d\n"
+                        "splits = %d\n",
+                        xtStat.search,
+                        xtStat.fastSearch,
+                        xtStat.split);
+        /* hand back the part of the report beginning at <offset> */
+        *start = buffer + offset;
+        avail -= offset;
+        if (avail <= length)
+                *eof = 1;
+        else
+                avail = length;
+        /* an offset beyond the report leaves nothing to return */
+        return (avail < 0) ? 0 : avail;
+}
+#endif
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
new file mode 100644
index 000000000000..a69784254fe7
--- /dev/null
+++ b/fs/jfs/jfs_xtree.h
@@ -0,0 +1,140 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _H_JFS_XTREE
+#define _H_JFS_XTREE
+/*
+ *      jfs_xtree.h: extent allocation descriptor B+-tree manager
+ */
+#include "jfs_btree.h"
+/*
+ *      extent allocation descriptor (xad)
+ *
+ * The offset and the address are each split into an 8-bit field
+ * (off1/addr1: the bits above bit 31) and a little-endian 32-bit field
+ * (off2/addr2: the low 32 bits); see offsetXAD()/addressXAD().
+ * The number before each ':' in the field comments is the field's
+ * size in bytes.
+ */
+typedef struct xad {
+        unsigned flag:8;        /* 1: flag */
+        unsigned rsvrd:16;      /* 2: reserved */
+        unsigned off1:8;        /* 1: offset in unit of fsblksize */
+        __le32 off2;            /* 4: offset in unit of fsblksize */
+        unsigned len:24;        /* 3: length in unit of fsblksize */
+        unsigned addr1:8;       /* 1: address in unit of fsblksize */
+        __le32 addr2;           /* 4: address in unit of fsblksize */
+} xad_t;                        /* (16) */
+/* largest value that fits the 24-bit len field */
+#define MAXXLEN         ((1 << 24) - 1)
+/* size of one xad slot in bytes, and its log2 */
+#define XTSLOTSIZE      16
+#define L2XTSLOTSIZE    4
+/* xad_t field construction */
+#define XADoffset(xad, offset64)\
+{\
+        (xad)->off1 = ((u64)offset64) >> 32;\
+        (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
+}
+#define XADaddress(xad, address64)\
+{\
+        (xad)->addr1 = ((u64)address64) >> 32;\
+        (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+}
+#define XADlength(xad, length32)        (xad)->len = __cpu_to_le24(length32)
+/* xad_t field extraction */
+#define offsetXAD(xad)\
+        ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
+#define addressXAD(xad)\
+        ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
+#define lengthXAD(xad)  __le24_to_cpu((xad)->len)
+/*
+ * xad list: an array of xad_t together with two 16-bit counts
+ * (used as the result argument of xtLookupList())
+ */
+struct xadlist {
+        s16 maxnxad;            /* presumably capacity of xad[] - TODO confirm */
+        s16 nxad;               /* presumably entries in use - TODO confirm */
+        xad_t *xad;             /* the xad array */
+};
+/* xad_t flags (bit values for the 8-bit xad flag field) */
+#define XAD_NEW         0x01    /* new */
+#define XAD_EXTENDED    0x02    /* extended */
+#define XAD_COMPRESSED  0x04    /* compressed with recorded length */
+#define XAD_NOTRECORDED 0x08    /* allocated but not recorded */
+#define XAD_COW         0x10    /* copy-on-write */
+/* possible values for maxentry */
+#define XTROOTINITSLOT_DIR  6
+#define XTROOTINITSLOT  10
+#define XTROOTMAXSLOT   18
+#define XTPAGEMAXSLOT   256
+/*
+ * index of the first xad entry in a page: the 32-byte page header
+ * overlays the first two 16-byte slots of the xad array (see xtpage_t)
+ */
+#define XTENTRYSTART    2
+/*
+ *      xtree page:
+ *
+ * A union: the header shares storage with the start of the xad slot
+ * array, so xad entries are addressed from index XTENTRYSTART upward.
+ * nextindex/maxentry are stored little-endian (read them through
+ * le16_to_cpu()).
+ * NOTE(review): the array is declared with XTROOTMAXSLOT entries while
+ * XTPAGEMAXSLOT is also listed as a possible maxentry; presumably
+ * non-root pages are accessed past the declared bound - confirm.
+ */
+typedef union {
+        struct xtheader {
+                __le64 next;    /* 8: */
+                __le64 prev;    /* 8: */
+                u8 flag;        /* 1: */
+                u8 rsrvd1;      /* 1: */
+                __le16 nextindex;       /* 2: next index = number of entries */
+                __le16 maxentry;        /* 2: max number of entries */
+                __le16 rsrvd2;  /* 2: */
+                pxd_t self;     /* 8: self */
+        } header;               /* (32) */
+        xad_t xad[XTROOTMAXSLOT];       /* 16 * maxentry: xad array */
+} xtpage_t;
+/*
+ *      external declaration
+ *
+ * xtTailgate() is declared only when _NOTYET is defined; the
+ * xtDisplay*() debug routines only when _JFS_DEBUG_XTREE is defined.
+ */
+extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
+                    int *pflag, s64 * paddr, int *plen, int flag);
+extern int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
+                        struct xadlist * xadlist, int flag);
+extern void xtInitRoot(tid_t tid, struct inode *ip);
+extern int xtInsert(tid_t tid, struct inode *ip,
+                    int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
+extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
+                    int flag);
+#ifdef _NOTYET
+extern int xtTailgate(tid_t tid, struct inode *ip,
+                      s64 xoff, int xlen, s64 xaddr, int flag);
+#endif
+extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
+extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
+                    int flag);
+extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type);
+extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size);
+extern int xtRelocate(tid_t tid, struct inode *ip,
+                      xad_t * oxad, s64 nxaddr, int xtype);
+extern int xtAppend(tid_t tid,
+                    struct inode *ip, int xflag, s64 xoff, int maxblocks,
+                    int *xlenp, s64 * xaddrp, int flag);
+#ifdef  _JFS_DEBUG_XTREE
+/* debug-only tree/page dump routines */
+extern int xtDisplayTree(struct inode *ip);
+extern int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p);
+#endif                          /* _JFS_DEBUG_XTREE */
+#endif                          /* !_H_JFS_XTREE */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
new file mode 100644
index 000000000000..8413a368f449
--- /dev/null
+++ b/fs/jfs/namei.c
@@ -0,0 +1,1540 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_inode.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_unicode.h"
+#include "jfs_metapage.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+#include "jfs_debug.h"
+extern struct inode_operations jfs_file_inode_operations;
+extern struct inode_operations jfs_symlink_inode_operations;
+extern struct file_operations jfs_file_operations;
+extern struct address_space_operations jfs_aops;
+extern int jfs_fsync(struct file *, struct dentry *, int);
+extern void jfs_truncate_nolock(struct inode *, loff_t);
+extern int jfs_init_acl(struct inode *, struct inode *);
+/*
+ * forward references
+ */
+struct inode_operations jfs_dir_inode_operations;
+struct file_operations jfs_dir_operations;
+struct dentry_operations jfs_ci_dentry_operations;
+static s64 commitZeroLink(tid_t, struct inode *);
+/*
+ * NAME:        jfs_create(dip, dentry, mode)
+ *
+ * FUNCTION:    create a regular file named by <dentry> in the parent
+ *              directory <dip>, with mode = <mode>
+ *
+ * PARAMETER:   dip     - parent directory inode
+ *              dentry  - dentry of new file
+ *              mode    - create mode (rwxrwxrwx).
+ *              nd      - nameidata (not referenced here)
+ *
+ * RETURN:      0 on success
+ *              -ENOSPC if no inode could be allocated
+ *              otherwise errors from subroutines
+ *
+ */
+static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
+                struct nameidata *nd)
+{
+        struct component_name dname;    /* name of the new file */
+        struct inode *ip = NULL;        /* inode of the new file */
+        struct inode *iplist[2];        /* inodes handed to txCommit() */
+        struct btstack btstack;
+        struct tblock *tblk;
+        tid_t tid;                      /* transaction id */
+        ino_t ino;
+        int rc;
+        jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
+        /* convert the dentry name for the directory search */
+        rc = get_UCSname(&dname, dentry);
+        if (rc)
+                goto out1;
+        /*
+         * Either iAlloc() or txBegin() may block.  Deadlock can occur if we
+         * block there while holding dtree page, so we allocate the inode &
+         * begin the transaction before we search the directory.
+         */
+        ip = ialloc(dip, mode);
+        if (!ip) {
+                rc = -ENOSPC;
+                goto out2;
+        }
+        tid = txBegin(dip->i_sb, 0);
+        down(&JFS_IP(dip)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        /*
+         * search parent directory for entry/freespace
+         * (dtSearch() returns parent directory page pinned)
+         */
+        rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE);
+        if (rc) {
+                jfs_err("jfs_create: dtSearch returned %d", rc);
+                goto out3;
+        }
+        /* describe the create in the transaction block */
+        tblk = tid_to_tblock(tid);
+        tblk->xflag |= COMMIT_CREATE;
+        tblk->ino = ip->i_ino;
+        tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+        iplist[0] = dip;
+        iplist[1] = ip;
+        /*
+         * initialize the child XAD tree root in-line in inode
+         */
+        xtInitRoot(tid, ip);
+        /*
+         * create entry in parent directory for the new file
+         * (dtInsert() releases parent directory page)
+         */
+        ino = ip->i_ino;
+        rc = dtInsert(tid, dip, &dname, &ino, &btstack);
+        if (rc) {
+                if (rc != -EIO) {
+                        txAbort(tid, 0);        /* Filesystem full */
+                } else {
+                        jfs_err("jfs_create: dtInsert returned -EIO");
+                        txAbort(tid, 1);        /* Marks Filesystem dirty */
+                }
+                goto out3;
+        }
+        /* wire up the operations of a regular file */
+        ip->i_op = &jfs_file_inode_operations;
+        ip->i_fop = &jfs_file_operations;
+        ip->i_mapping->a_ops = &jfs_aops;
+        insert_inode_hash(ip);
+        mark_inode_dirty(ip);
+        /* update parent directory inode */
+        dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+        mark_inode_dirty(dip);
+        rc = txCommit(tid, 2, iplist, 0);
+      out3:
+        txEnd(tid);
+        up(&JFS_IP(dip)->commit_sem);
+        up(&JFS_IP(ip)->commit_sem);
+        if (!rc) {
+                d_instantiate(dentry, ip);
+        } else {
+                /* drop the new inode: zero link count, then release */
+                ip->i_nlink = 0;
+                iput(ip);
+        }
+      out2:
+        free_UCSname(&dname);
+#ifdef CONFIG_JFS_POSIX_ACL
+        if (!rc)
+                jfs_init_acl(ip, dip);
+#endif
+      out1:
+        jfs_info("jfs_create: rc:%d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_mkdir(dip, dentry, mode)
+ *
+ * FUNCTION:    create a child directory named by <dentry> in the parent
+ *              directory <dip>, with mode = S_IFDIR | <mode>
+ *
+ * PARAMETER:   dip     - parent directory inode
+ *              dentry  - dentry of child directory
+ *              mode    - create mode (rwxrwxrwx).
+ *
+ * RETURN:      0 on success
+ *              -EMLINK if the parent's link count equals JFS_LINK_MAX
+ *              -ENOSPC if no inode could be allocated
+ *              otherwise errors from subroutines
+ *
+ * note:
+ * EACCESS: user needs search+write permission on the parent directory
+ * (not checked in this function)
+ */
+static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
+{
+        struct component_name dname;    /* child directory name */
+        struct inode *ip = NULL;        /* child directory inode */
+        struct inode *iplist[2];        /* inodes handed to txCommit() */
+        struct btstack btstack;
+        struct tblock *tblk;
+        tid_t tid;                      /* transaction id */
+        ino_t ino;
+        int rc = 0;
+        jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+        /* link count overflow on parent directory ? */
+        if (dip->i_nlink == JFS_LINK_MAX) {
+                rc = -EMLINK;
+                goto out1;
+        }
+        /* convert the dentry name for the directory search */
+        rc = get_UCSname(&dname, dentry);
+        if (rc)
+                goto out1;
+        /*
+         * Either iAlloc() or txBegin() may block.  Deadlock can occur if we
+         * block there while holding dtree page, so we allocate the inode &
+         * begin the transaction before we search the directory.
+         */
+        ip = ialloc(dip, S_IFDIR | mode);
+        if (!ip) {
+                rc = -ENOSPC;
+                goto out2;
+        }
+        tid = txBegin(dip->i_sb, 0);
+        down(&JFS_IP(dip)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        /*
+         * search parent directory for entry/freespace
+         * (dtSearch() returns parent directory page pinned)
+         */
+        rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE);
+        if (rc) {
+                jfs_err("jfs_mkdir: dtSearch returned %d", rc);
+                goto out3;
+        }
+        /* describe the create in the transaction block */
+        tblk = tid_to_tblock(tid);
+        tblk->xflag |= COMMIT_CREATE;
+        tblk->ino = ip->i_ino;
+        tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+        iplist[0] = dip;
+        iplist[1] = ip;
+        /*
+         * initialize the child directory in-line in inode
+         */
+        dtInitRoot(tid, ip, dip->i_ino);
+        /*
+         * create entry in parent directory for child directory
+         * (dtInsert() releases parent directory page)
+         */
+        ino = ip->i_ino;
+        rc = dtInsert(tid, dip, &dname, &ino, &btstack);
+        if (rc) {
+                if (rc != -EIO) {
+                        txAbort(tid, 0);        /* Filesystem full */
+                } else {
+                        jfs_err("jfs_mkdir: dtInsert returned -EIO");
+                        txAbort(tid, 1);        /* Marks Filesystem dirty */
+                }
+                goto out3;
+        }
+        /* set up the new directory inode */
+        ip->i_nlink = 2;        /* for '.' */
+        ip->i_op = &jfs_dir_inode_operations;
+        ip->i_fop = &jfs_dir_operations;
+        insert_inode_hash(ip);
+        mark_inode_dirty(ip);
+        /* update parent directory inode */
+        dip->i_nlink++;         /* for '..' from child directory */
+        dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+        mark_inode_dirty(dip);
+        rc = txCommit(tid, 2, iplist, 0);
+      out3:
+        txEnd(tid);
+        up(&JFS_IP(dip)->commit_sem);
+        up(&JFS_IP(ip)->commit_sem);
+        if (!rc) {
+                d_instantiate(dentry, ip);
+        } else {
+                /* drop the new inode: zero link count, then release */
+                ip->i_nlink = 0;
+                iput(ip);
+        }
+      out2:
+        free_UCSname(&dname);
+#ifdef CONFIG_JFS_POSIX_ACL
+        if (!rc)
+                jfs_init_acl(ip, dip);
+#endif
+      out1:
+        jfs_info("jfs_mkdir: rc:%d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_rmdir(dip, dentry)
+ *
+ * FUNCTION:    remove a link to child directory
+ *
+ * PARAMETER:   dip     - parent inode
+ *              dentry  - child directory dentry
+ *
+ * RETURN:      0 on success
+ *              -ENOTEMPTY - if the child directory is not empty
+ *              errors from subroutines
+ *              NOTE(review): this header used to list -EINVAL for "." and
+ *              ".."; no such check is visible in this function - confirm
+ *              where (or whether) it is enforced.
+ *
+ * note:
+ * if other threads have the directory open when the last link
+ * is removed, the "." and ".." entries, if present, are removed before
+ * rmdir() returns and no new entries may be created in the directory,
+ * but the directory is not removed until the last reference to
+ * the directory is released (cf.unlink() of regular file).
+ */
+static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
+{
+        int rc;
+        tid_t tid;              /* transaction id */
+        struct inode *ip = dentry->d_inode;     /* directory being removed */
+        ino_t ino;
+        struct component_name dname;
+        struct inode *iplist[2];        /* inodes handed to txCommit() */
+        struct tblock *tblk;
+        jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+        /* Init inode for quota operations. */
+        DQUOT_INIT(ip);
+        /* directory must be empty to be removed */
+        if (!dtEmpty(ip)) {
+                rc = -ENOTEMPTY;
+                goto out;
+        }
+        if ((rc = get_UCSname(&dname, dentry))) {
+                goto out;
+        }
+        /* start the transaction; parent's commit_sem is taken first */
+        tid = txBegin(dip->i_sb, 0);
+        down(&JFS_IP(dip)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        iplist[0] = dip;
+        iplist[1] = ip;
+        tblk = tid_to_tblock(tid);
+        tblk->xflag |= COMMIT_DELETE;
+        tblk->u.ip = ip;
+        /*
+         * delete the entry of target directory from parent directory
+         */
+        ino = ip->i_ino;
+        if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) {
+                jfs_err("jfs_rmdir: dtDelete returned %d", rc);
+                /* only -EIO aborts; any other error just ends the tx */
+                if (rc == -EIO)
+                        txAbort(tid, 1);
+                txEnd(tid);
+                up(&JFS_IP(dip)->commit_sem);
+                up(&JFS_IP(ip)->commit_sem);
+                goto out2;
+        }
+        /* update parent directory's link count corresponding
+         * to ".." entry of the target directory deleted
+         */
+        dip->i_nlink--;
+        dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+        mark_inode_dirty(dip);
+        /*
+         * OS/2 could have created EA and/or ACL
+         */
+        /* free EA from both persistent and working map */
+        if (JFS_IP(ip)->ea.flag & DXD_EXTENT) {
+                /* free EA pages */
+                txEA(tid, ip, &JFS_IP(ip)->ea, NULL);
+        }
+        JFS_IP(ip)->ea.flag = 0;
+        /* free ACL from both persistent and working map */
+        if (JFS_IP(ip)->acl.flag & DXD_EXTENT) {
+                /* free ACL pages */
+                txEA(tid, ip, &JFS_IP(ip)->acl, NULL);
+        }
+        JFS_IP(ip)->acl.flag = 0;
+        /* mark the target directory as deleted */
+        ip->i_nlink = 0;
+        mark_inode_dirty(ip);
+        rc = txCommit(tid, 2, &iplist[0], 0);
+        txEnd(tid);
+        up(&JFS_IP(dip)->commit_sem);
+        up(&JFS_IP(ip)->commit_sem);
+        /*
+         * Truncating the directory index table is not guaranteed.  It
+         * may need to be done iteratively
+         */
+        if (test_cflag(COMMIT_Stale, dip)) {
+                if (dip->i_size > 1)
+                        jfs_truncate_nolock(dip, 0);
+                clear_cflag(COMMIT_Stale, dip);
+        }
+      out2:
+        free_UCSname(&dname);
+      out:
+        jfs_info("jfs_rmdir: rc:%d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_unlink(dip, dentry)
+ *
+ * FUNCTION:    remove the link named by <dentry> from parent directory
+ *              <dip>; when the link count of the target drops to zero
+ *              its block resources are freed through commitZeroLink()
+ *
+ * PARAMETER:   dip     - inode of parent directory
+ *              dentry  - dentry of object to be removed
+ *
+ * RETURN:      0 on success; errors from subroutines
+ *
+ * note:
+ * temporary file: if one or more processes have the file open
+ * when the last link is removed, the link will be removed before
+ * unlink() returns, but the removal of the file contents will be
+ * postponed until all references to the files are closed.
+ *
+ * JFS does NOT support unlink() on directories.
+ *
+ */
+static int jfs_unlink(struct inode *dip, struct dentry *dentry)
+{
+        int rc;
+        tid_t tid;              /* transaction id */
+        struct inode *ip = dentry->d_inode;     /* object being unlinked */
+        ino_t ino;
+        struct component_name dname;    /* object name */
+        struct inode *iplist[2];        /* inodes handed to txCommit() */
+        struct tblock *tblk;
+        s64 new_size = 0;       /* non-zero: truncation still incomplete */
+        int commit_flag;
+        jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
+        /* Init inode for quota operations. */
+        DQUOT_INIT(ip);
+        if ((rc = get_UCSname(&dname, dentry)))
+                goto out;
+        /* held until all truncation passes below are done */
+        IWRITE_LOCK(ip);
+        tid = txBegin(dip->i_sb, 0);
+        down(&JFS_IP(dip)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        iplist[0] = dip;
+        iplist[1] = ip;
+        /*
+         * delete the entry of target file from parent directory
+         */
+        ino = ip->i_ino;
+        if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) {
+                jfs_err("jfs_unlink: dtDelete returned %d", rc);
+                /* only -EIO aborts; any other error just ends the tx */
+                if (rc == -EIO)
+                        txAbort(tid, 1);        /* Marks FS Dirty */
+                txEnd(tid);
+                up(&JFS_IP(dip)->commit_sem);
+                up(&JFS_IP(ip)->commit_sem);
+                IWRITE_UNLOCK(ip);
+                goto out1;
+        }
+        ASSERT(ip->i_nlink);
+        ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+        mark_inode_dirty(dip);
+        /* update target's inode */
+        ip->i_nlink--;
+        mark_inode_dirty(ip);
+        /*
+         *      commit zero link count object
+         */
+        if (ip->i_nlink == 0) {
+                assert(!test_cflag(COMMIT_Nolink, ip));
+                /* free block resources */
+                if ((new_size = commitZeroLink(tid, ip)) < 0) {
+                        /* NOTE(review): i_nlink has already been
+                         * decremented above and is not restored here
+                         */
+                        txAbort(tid, 1);        /* Marks FS Dirty */
+                        txEnd(tid);
+                        up(&JFS_IP(dip)->commit_sem);
+                        up(&JFS_IP(ip)->commit_sem);
+                        IWRITE_UNLOCK(ip);
+                        rc = new_size;
+                        goto out1;
+                }
+                tblk = tid_to_tblock(tid);
+                tblk->xflag |= COMMIT_DELETE;
+                tblk->u.ip = ip;
+        }
+        /*
+         * Incomplete truncate of file data can
+         * result in timing problems unless we synchronously commit the
+         * transaction.
+         */
+        if (new_size)
+                commit_flag = COMMIT_SYNC;
+        else
+                commit_flag = 0;
+        /*
+         * If xtTruncate was incomplete, commit synchronously to avoid
+         * timing complications
+         */
+        rc = txCommit(tid, 2, &iplist[0], commit_flag);
+        txEnd(tid);
+        up(&JFS_IP(dip)->commit_sem);
+        up(&JFS_IP(ip)->commit_sem);
+        /*
+         * finish an incomplete truncation, one synchronous transaction
+         * per pass, until xtTruncate_pmap() returns 0 or an error occurs
+         *
+         * NOTE(review): only ip's commit_sem is taken in this loop, yet
+         * txCommit() is still passed both inodes of iplist - confirm
+         * that this is intended.
+         */
+        while (new_size && (rc == 0)) {
+                tid = txBegin(dip->i_sb, 0);
+                down(&JFS_IP(ip)->commit_sem);
+                new_size = xtTruncate_pmap(tid, ip, new_size);
+                if (new_size < 0) {
+                        txAbort(tid, 1);        /* Marks FS Dirty */
+                        rc = new_size;
+                } else
+                        rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC);
+                txEnd(tid);
+                up(&JFS_IP(ip)->commit_sem);
+        }
+        if (ip->i_nlink == 0)
+                set_cflag(COMMIT_Nolink, ip);
+        IWRITE_UNLOCK(ip);
+        /*
+         * Truncating the directory index table is not guaranteed.  It
+         * may need to be done iteratively
+         */
+        if (test_cflag(COMMIT_Stale, dip)) {
+                if (dip->i_size > 1)
+                        jfs_truncate_nolock(dip, 0);
+                clear_cflag(COMMIT_Stale, dip);
+        }
+      out1:
+        free_UCSname(&dname);
+      out:
+        jfs_info("jfs_unlink: rc:%d", rc);
+        return rc;
+}
+/*
+ * NAME:        commitZeroLink()
+ *
+ * FUNCTION:    Within transaction <tid>, release the on-disk resources of
+ *              a non-directory inode whose link count has reached zero.
+ *
+ *              Only regular files and symbolic links are processed; any
+ *              other file type returns 0 immediately (a directory trips
+ *              the assert).  A fast symbolic link (i_size < IDATASIZE)
+ *              merely has its size reset to zero.
+ *
+ *              For everything else the inode is flagged COMMIT_Freewmap,
+ *              the transaction block is marked COMMIT_PMAP, txEA() is
+ *              called for an out-of-line EA and/or ACL extent, and the
+ *              file data is truncated through xtTruncate_pmap().
+ *
+ * PARAMETERS:  tid     - transaction id
+ *              ip      - inode whose resources are to be released
+ *
+ * RETURN:      0 when there is nothing (more) to truncate, otherwise the
+ *              value of xtTruncate_pmap().  The callers in this file treat
+ *              a negative value as an error and a positive value as the
+ *              size to hand to a follow-up xtTruncate_pmap() call.
+ */
+static s64 commitZeroLink(tid_t tid, struct inode *ip)
+{
+        int filetype;
+        jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip);
+        filetype = ip->i_mode & S_IFMT;
+        if (filetype == S_IFLNK) {
+                /* fast symbolic link: nothing to release but the size */
+                if (ip->i_size < IDATASIZE) {
+                        ip->i_size = 0;
+                        return 0;
+                }
+        } else if (filetype != S_IFREG) {
+                /* directories must never get here */
+                assert(filetype != S_IFDIR);
+                return 0;
+        }
+        set_cflag(COMMIT_Freewmap, ip);
+        /* mark transaction of block map update type */
+        tid_to_tblock(tid)->xflag |= COMMIT_PMAP;
+        /* free EA: acquire maplock on EA to be freed from block map */
+        if (JFS_IP(ip)->ea.flag & DXD_EXTENT)
+                txEA(tid, ip, &JFS_IP(ip)->ea, NULL);
+        /* free ACL: acquire maplock on ACL to be freed from block map */
+        if (JFS_IP(ip)->acl.flag & DXD_EXTENT)
+                txEA(tid, ip, &JFS_IP(ip)->acl, NULL);
+        /*
+         * free xtree/data (truncate to zero length); an empty file has
+         * nothing left to truncate
+         */
+        if (!ip->i_size)
+                return 0;
+        return xtTruncate_pmap(tid, ip, 0);
+}
+/*
+ * NAME:        freeZeroLink()
+ *
+ * FUNCTION:    Release, from the cache and the WORKING block map, the
+ *              resources of a regular file or symbolic link.  The original
+ *              notes describe the target as a file previously committed
+ *              with zero link count.
+ *
+ *              Other file types, and symbolic links whose target is stored
+ *              inside the inode (i_size < IDATASIZE), have nothing to free.
+ *              For the rest, an out-of-line EA extent and ACL extent are
+ *              each invalidated in the cache and freed with txFreeMap(),
+ *              then the file data is truncated with xtTruncate().
+ *
+ * PARAMETER:   ip      - pointer to inode of file.
+ *
+ * RETURN:      0 when nothing had to be truncated, otherwise the value
+ *              returned by xtTruncate()
+ */
+int freeZeroLink(struct inode *ip)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        struct maplock maplock;         /* maplock for COMMIT_WMAP */
+        struct pxd_lock *pxdlock = (struct pxd_lock *) &maplock;
+        s64 xaddr;
+        int xlen;
+        int type;
+        jfs_info("freeZeroLink: ip = 0x%p", ip);
+        type = ip->i_mode & S_IFMT;
+        /* only regular files and symbolic links carry anything to free */
+        if ((type != S_IFREG) && (type != S_IFLNK))
+                return 0;
+        /* if the link target is contained in the inode, nothing to do */
+        if ((type == S_IFLNK) && (ip->i_size < IDATASIZE))
+                return 0;
+        /*
+         * free EA
+         */
+        if (ji->ea.flag & DXD_EXTENT) {
+                xaddr = addressDXD(&ji->ea);
+                xlen = lengthDXD(&ji->ea);
+                /* free EA pages from cache */
+                invalidate_dxd_metapages(ip, ji->ea);
+                /* free EA extent from working block map */
+                maplock.index = 1;
+                pxdlock->flag = mlckFREEPXD;
+                PXDaddress(&pxdlock->pxd, xaddr);
+                PXDlength(&pxdlock->pxd, xlen);
+                txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+        }
+        /*
+         * free ACL
+         */
+        if (ji->acl.flag & DXD_EXTENT) {
+                xaddr = addressDXD(&ji->acl);
+                xlen = lengthDXD(&ji->acl);
+                /* free ACL pages from cache */
+                invalidate_dxd_metapages(ip, ji->acl);
+                /* free ACL extent from working block map */
+                maplock.index = 1;
+                pxdlock->flag = mlckFREEPXD;
+                PXDaddress(&pxdlock->pxd, xaddr);
+                PXDlength(&pxdlock->pxd, xlen);
+                txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+        }
+        /*
+         * free xtree/data (truncate to zero length):
+         * free xtree/data pages from cache, and
+         * free xtree/data blocks from working block map;
+         */
+        if (!ip->i_size)
+                return 0;
+        return xtTruncate(0, ip, 0, COMMIT_WMAP);
+}
+/*
+ * NAME:        jfs_link(old_dentry, dir, dentry)
+ *
+ * FUNCTION:    create a hard link, named by <dentry> inside directory
+ *              <dir>, to the inode behind <old_dentry>
+ *
+ * PARAMETER:   old_dentry - dentry of the existing (target) object
+ *              dir        - parent directory of the new link
+ *              dentry     - dentry naming the new link
+ *
+ * RETURN:      -EMLINK if the target already has JFS_LINK_MAX links,
+ *              -ENOENT if the target's link count is already zero,
+ *              otherwise errors from subroutines
+ *
+ * note:
+ * JFS does NOT support link() on directories (to prevent circular
+ * path in the directory hierarchy);
+ * EPERM: the target object is a directory, and either the caller
+ * does not have appropriate privileges or the implementation prohibits
+ * using link() on directories [XPG4.2].
+ *
+ * JFS does NOT support links between file systems:
+ * EXDEV: target object and new link are on different file systems and
+ * implementation does not support links between file systems [XPG4.2].
+ *
+ * Neither of those two conditions is tested in this function.
+ */
+static int jfs_link(struct dentry *old_dentry,
+             struct inode *dir, struct dentry *dentry)
+{
+        int rc;
+        tid_t tid;
+        struct inode *ip = old_dentry->d_inode;
+        ino_t ino;
+        struct component_name dname;
+        struct btstack btstack;
+        struct inode *iplist[2];
+        jfs_info("jfs_link: %s %s", old_dentry->d_name.name,
+                 dentry->d_name.name);
+        if (ip->i_nlink == JFS_LINK_MAX)
+                return -EMLINK;
+        if (ip->i_nlink == 0)
+                return -ENOENT;
+        tid = txBegin(ip->i_sb, 0);
+        down(&JFS_IP(dir)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        /*
+         * scan parent directory for entry/freespace
+         */
+        if ((rc = get_UCSname(&dname, dentry)))
+                goto out;
+        if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
+                goto free_dname;
+        /*
+         * create entry for new link in parent directory
+         */
+        ino = ip->i_ino;
+        if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack)))
+                goto free_dname;
+        /* update object inode */
+        ip->i_nlink++;          /* for new link */
+        ip->i_ctime = CURRENT_TIME;
+        /*
+         * The directory gained an entry, so its modification and change
+         * times move as well (jfs_mknod() does the same for its parent).
+         */
+        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+        mark_inode_dirty(dir);
+        /* extra reference consumed by d_instantiate() or iput() below */
+        atomic_inc(&ip->i_count);
+        iplist[0] = ip;
+        iplist[1] = dir;
+        rc = txCommit(tid, 2, &iplist[0], 0);
+        if (rc) {
+                /* commit failed: undo the link count and the reference */
+                ip->i_nlink--;
+                iput(ip);
+        } else
+                d_instantiate(dentry, ip);
+      free_dname:
+        free_UCSname(&dname);
+      out:
+        txEnd(tid);
+        up(&JFS_IP(dir)->commit_sem);
+        up(&JFS_IP(ip)->commit_sem);
+        jfs_info("jfs_link: rc:%d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_symlink(dip, dentry, name)
+ *
+ * FUNCTION:    creates a symbolic link, named by <dentry> in directory
+ *              <dip>, whose target path is <name>
+ *
+ *              A target that fits in IDATASIZE bytes (including the
+ *              terminating NUL) is stored inline in the inode (fast
+ *              symbolic link); a longer one is written to a single
+ *              extent allocated with xtInsert().
+ *
+ * PARAMETER:   dip     - parent directory inode
+ *              dentry  - dentry of symbolic link
+ *              name    - the path name of the existing object
+ *                        that will be the source of the link
+ *
+ * RETURN:      -ENOSPC if no inode could be allocated, -EIO if a metapage
+ *              for the target path could not be obtained, otherwise
+ *              errors from subroutines
+ *
+ * note:
+ * ENAMETOOLONG: pathname resolution of a symbolic link produced
+ * an intermediate result whose length exceeds PATH_MAX [XPG4.2]
+ * (not tested in this function)
+ */
+static int jfs_symlink(struct inode *dip, struct dentry *dentry,
+                const char *name)
+{
+        int rc;
+        tid_t tid;
+        ino_t ino = 0;
+        struct component_name dname;
+        int ssize;              /* source pathname size */
+        struct btstack btstack;
+        struct inode *ip = dentry->d_inode;
+        unchar *i_fastsymlink;
+        s64 xlen = 0;
+        int bmask = 0, xsize;
+        s64 xaddr;
+        struct metapage *mp;
+        struct super_block *sb;
+        struct tblock *tblk;
+        struct inode *iplist[2];
+        jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
+        ssize = strlen(name) + 1;
+        /*
+         * convert the link's own name for the directory search below
+         */
+        if ((rc = get_UCSname(&dname, dentry)))
+                goto out1;
+        /*
+         * allocate on-disk/in-memory inode for symbolic link:
+         * (iAlloc() returns new, locked inode)
+         */
+        ip = ialloc(dip, S_IFLNK | 0777);
+        if (ip == NULL) {
+                rc = -ENOSPC;
+                goto out2;
+        }
+        tid = txBegin(dip->i_sb, 0);
+        down(&JFS_IP(dip)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        tblk = tid_to_tblock(tid);
+        tblk->xflag |= COMMIT_CREATE;
+        tblk->ino = ip->i_ino;
+        tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+        /* fix symlink access permission
+         * (dir_create() ANDs in the u.u_cmask, 
+         * but symlinks really need to be 777 access)
+         */
+        ip->i_mode |= 0777;
+        /*
+         * write symbolic link target path name
+         */
+        xtInitRoot(tid, ip);
+        /*
+         * write source path name inline in on-disk inode (fast symbolic link)
+         */
+        if (ssize <= IDATASIZE) {
+                ip->i_op = &jfs_symlink_inode_operations;
+                i_fastsymlink = JFS_IP(ip)->i_inline;
+                memcpy(i_fastsymlink, name, ssize);
+                ip->i_size = ssize - 1;
+                /*
+                 * if the target overflows the i_inline area, we don't
+                 * have the space to store inline extended attributes
+                 */
+                if (ssize > sizeof (JFS_IP(ip)->i_inline))
+                        JFS_IP(ip)->mode2 &= ~INLINEEA;
+                jfs_info("jfs_symlink: fast symlink added  ssize:%d name:%s ",
+                         ssize, name);
+        }
+        /*
+         * write source path name in a single extent
+         */
+        else {
+                jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
+                ip->i_op = &page_symlink_inode_operations;
+                ip->i_mapping->a_ops = &jfs_aops;
+                /*
+                 * even though the data of symlink object (source 
+                 * path name) is treated as non-journaled user data,
+                 * it is read/written thru buffer cache for performance.
+                 */
+                sb = ip->i_sb;
+                bmask = JFS_SBI(sb)->bsize - 1;
+                /* round the path size up to a whole number of blocks */
+                xsize = (ssize + bmask) & ~bmask;
+                xaddr = 0;
+                xlen = xsize >> JFS_SBI(sb)->l2bsize;
+                if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) {
+                        /* hand the real xtInsert() error to the caller */
+                        txAbort(tid, 0);
+                        goto out3;
+                }
+                ip->i_size = ssize - 1;
+                while (ssize) {
+                        /* This is kind of silly since PATH_MAX == 4K */
+                        int copy_size = min(ssize, PSIZE);
+                        mp = get_metapage(ip, xaddr, PSIZE, 1);
+                        if (mp == NULL) {
+                                xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+                                rc = -EIO;
+                                txAbort(tid, 0);
+                                goto out3;
+                        }
+                        memcpy(mp->data, name, copy_size);
+                        flush_metapage(mp);
+                        ssize -= copy_size;
+                        name += copy_size;
+                        xaddr += JFS_SBI(sb)->nbperpage;
+                }
+        }
+        /*
+         * create entry for symbolic link in parent directory
+         */
+        rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE);
+        if (rc == 0) {
+                ino = ip->i_ino;
+                rc = dtInsert(tid, dip, &dname, &ino, &btstack);
+        }
+        if (rc) {
+                /* give back the extent allocated for the target path */
+                if (xlen)
+                        xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+                txAbort(tid, 0);
+                /* discard new inode */
+                goto out3;
+        }
+        insert_inode_hash(ip);
+        mark_inode_dirty(ip);
+        /*
+         * The directory gained an entry, so its modification and change
+         * times move as well (jfs_mknod() does the same for its parent).
+         */
+        dip->i_ctime = dip->i_mtime = CURRENT_TIME;
+        mark_inode_dirty(dip);
+        /*
+         * commit update of parent directory and link object
+         */
+        iplist[0] = dip;
+        iplist[1] = ip;
+        rc = txCommit(tid, 2, &iplist[0], 0);
+      out3:
+        txEnd(tid);
+        up(&JFS_IP(dip)->commit_sem);
+        up(&JFS_IP(ip)->commit_sem);
+        if (rc) {
+                /* drop the half-built inode */
+                ip->i_nlink = 0;
+                iput(ip);
+        } else
+                d_instantiate(dentry, ip);
+      out2:
+        free_UCSname(&dname);
+#ifdef CONFIG_JFS_POSIX_ACL
+        if (rc == 0)
+                jfs_init_acl(ip, dip);
+#endif
+      out1:
+        jfs_info("jfs_symlink: rc:%d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_rename
+ *
+ * FUNCTION:    rename a file or directory
+ *
+ *              Moves the entry <old_dentry> of <old_dir> to the name
+ *              <new_dentry> in <new_dir>.  An existing target entry is
+ *              redirected to the source inode with dtModify() and the
+ *              replaced inode loses its link(s); otherwise a new entry is
+ *              inserted.  The old entry is then deleted and everything is
+ *              committed in one transaction.  If the replaced file could
+ *              not be truncated completely, the remainder is truncated in
+ *              follow-up transactions.
+ *
+ * PARAMETER:   old_dir    - directory currently holding the entry
+ *              old_dentry - dentry of the object being renamed
+ *              new_dir    - directory receiving the entry
+ *              new_dentry - dentry of the new name (may already refer to
+ *                           an inode that is being replaced)
+ *
+ * RETURN:      -ENOENT    source entry not found or not the expected inode
+ *              -ESTALE    target entry does not match new_dentry
+ *              -ENOTEMPTY directory being replaced is not empty
+ *              -EMLINK    new_dir cannot take another subdirectory
+ *              -EIO       replaced directory has an unexpected link count
+ *              otherwise errors from subroutines
+ */
+static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+               struct inode *new_dir, struct dentry *new_dentry)
+{
+        struct btstack btstack;
+        ino_t ino;
+        struct component_name new_dname;
+        struct inode *new_ip;
+        struct component_name old_dname;
+        struct inode *old_ip;
+        int rc;
+        tid_t tid;
+        struct tlock *tlck;
+        struct dt_lock *dtlck;
+        struct lv *lv;
+        int ipcount;
+        struct inode *iplist[4];
+        struct tblock *tblk;
+        s64 new_size = 0;
+        int commit_flag;
+        int new_ip_locked = 0;  /* set once IWRITE_LOCK(new_ip) is held */
+        jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
+                 new_dentry->d_name.name);
+        old_ip = old_dentry->d_inode;
+        new_ip = new_dentry->d_inode;
+        if ((rc = get_UCSname(&old_dname, old_dentry)))
+                goto out1;
+        if ((rc = get_UCSname(&new_dname, new_dentry)))
+                goto out2;
+        /*
+         * Make sure source inode number is what we think it is
+         */
+        rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP);
+        if (rc || (ino != old_ip->i_ino)) {
+                rc = -ENOENT;
+                goto out3;
+        }
+        /*
+         * Make sure dest inode number (if any) is what we think it is
+         */
+        rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
+        if (rc == 0) {
+                if ((new_ip == 0) || (ino != new_ip->i_ino)) {
+                        rc = -ESTALE;
+                        goto out3;
+                }
+        } else if (rc != -ENOENT)
+                goto out3;
+        else if (new_ip) {
+                /* no entry exists, but one was expected */
+                rc = -ESTALE;
+                goto out3;
+        }
+        if (S_ISDIR(old_ip->i_mode)) {
+                if (new_ip) {
+                        if (!dtEmpty(new_ip)) {
+                                rc = -ENOTEMPTY;
+                                goto out3;
+                        }
+                } else if ((new_dir != old_dir) &&
+                           (new_dir->i_nlink == JFS_LINK_MAX)) {
+                        rc = -EMLINK;
+                        goto out3;
+                }
+        } else if (new_ip) {
+                IWRITE_LOCK(new_ip);
+                new_ip_locked = 1;
+                /* Init inode for quota operations. */
+                DQUOT_INIT(new_ip);
+        }
+        /*
+         * The real work starts here
+         */
+        tid = txBegin(new_dir->i_sb, 0);
+        down(&JFS_IP(new_dir)->commit_sem);
+        down(&JFS_IP(old_ip)->commit_sem);
+        if (old_dir != new_dir)
+                down(&JFS_IP(old_dir)->commit_sem);
+        if (new_ip) {
+                down(&JFS_IP(new_ip)->commit_sem);
+                /*
+                 * Change existing directory entry to new inode number
+                 */
+                ino = new_ip->i_ino;
+                rc = dtModify(tid, new_dir, &new_dname, &ino,
+                              old_ip->i_ino, JFS_RENAME);
+                if (rc)
+                        goto out4;
+                new_ip->i_nlink--;
+                if (S_ISDIR(new_ip->i_mode)) {
+                        new_ip->i_nlink--;
+                        if (new_ip->i_nlink) {
+                                /*
+                                 * Unexpected link count on the directory
+                                 * being replaced.  Report it, abort the
+                                 * transaction and leave through the
+                                 * common exit so that every commit_sem
+                                 * (new_ip's included), the transaction
+                                 * and both name buffers are released.
+                                 * This mirrors the commitZeroLink()
+                                 * failure path below, which also aborts
+                                 * before jumping to out4.
+                                 */
+                                jfs_error(new_ip->i_sb,
+                                          "jfs_rename: new_ip->i_nlink != 0");
+                                txAbort(tid, 0);
+                                rc = -EIO;
+                                goto out4;
+                        }
+                        tblk = tid_to_tblock(tid);
+                        tblk->xflag |= COMMIT_DELETE;
+                        tblk->u.ip = new_ip;
+                } else if (new_ip->i_nlink == 0) {
+                        assert(!test_cflag(COMMIT_Nolink, new_ip));
+                        /* free block resources */
+                        if ((new_size = commitZeroLink(tid, new_ip)) < 0) {
+                                txAbort(tid, 1);        /* Marks FS Dirty */
+                                rc = new_size;          
+                                goto out4;
+                        }
+                        tblk = tid_to_tblock(tid);
+                        tblk->xflag |= COMMIT_DELETE;
+                        tblk->u.ip = new_ip;
+                } else {
+                        new_ip->i_ctime = CURRENT_TIME;
+                        mark_inode_dirty(new_ip);
+                }
+        } else {
+                /*
+                 * Add new directory entry
+                 */
+                rc = dtSearch(new_dir, &new_dname, &ino, &btstack,
+                              JFS_CREATE);
+                if (rc) {
+                        jfs_err("jfs_rename didn't expect dtSearch to fail "
+                                "w/rc = %d", rc);
+                        goto out4;
+                }
+                ino = old_ip->i_ino;
+                rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack);
+                if (rc) {
+                        if (rc == -EIO)
+                                jfs_err("jfs_rename: dtInsert returned -EIO");
+                        goto out4;
+                }
+                /* a moved directory's ".." adds a link to its new parent */
+                if (S_ISDIR(old_ip->i_mode))
+                        new_dir->i_nlink++;
+        }
+        /*
+         * Remove old directory entry
+         */
+        ino = old_ip->i_ino;
+        rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE);
+        if (rc) {
+                jfs_err("jfs_rename did not expect dtDelete to return rc = %d",
+                        rc);
+                txAbort(tid, 1);        /* Marks Filesystem dirty */
+                goto out4;
+        }
+        if (S_ISDIR(old_ip->i_mode)) {
+                old_dir->i_nlink--;
+                if (old_dir != new_dir) {
+                        /*
+                         * Change inode number of parent for moved directory
+                         */
+                        JFS_IP(old_ip)->i_dtroot.header.idotdot =
+                                cpu_to_le32(new_dir->i_ino);
+                        /* Linelock header of dtree */
+                        tlck = txLock(tid, old_ip,
+                                    (struct metapage *) &JFS_IP(old_ip)->bxflag,
+                                      tlckDTREE | tlckBTROOT | tlckRELINK);
+                        dtlck = (struct dt_lock *) & tlck->lock;
+                        ASSERT(dtlck->index == 0);
+                        lv = & dtlck->lv[0];
+                        lv->offset = 0;
+                        lv->length = 1;
+                        dtlck->index++;
+                }
+        }
+        /*
+         * Update ctime on changed/moved inodes & mark dirty
+         */
+        old_ip->i_ctime = CURRENT_TIME;
+        mark_inode_dirty(old_ip);
+        new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
+        mark_inode_dirty(new_dir);
+        /* Build list of inodes modified by this transaction */
+        ipcount = 0;
+        iplist[ipcount++] = old_ip;
+        if (new_ip)
+                iplist[ipcount++] = new_ip;
+        iplist[ipcount++] = old_dir;
+        if (old_dir != new_dir) {
+                iplist[ipcount++] = new_dir;
+                old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+                mark_inode_dirty(old_dir);
+        }
+        /*
+         * Incomplete truncate of file data can
+         * result in timing problems unless we synchronously commit the
+         * transaction.
+         */
+        if (new_size)
+                commit_flag = COMMIT_SYNC;
+        else
+                commit_flag = 0;
+        rc = txCommit(tid, ipcount, iplist, commit_flag);
+      out4:
+        txEnd(tid);
+        up(&JFS_IP(new_dir)->commit_sem);
+        up(&JFS_IP(old_ip)->commit_sem);
+        if (old_dir != new_dir)
+                up(&JFS_IP(old_dir)->commit_sem);
+        if (new_ip)
+                up(&JFS_IP(new_ip)->commit_sem);
+        /*
+         * Finish an incomplete truncate of the replaced file, one
+         * synchronous transaction per pass
+         */
+        while (new_size && (rc == 0)) {
+                tid = txBegin(new_ip->i_sb, 0);
+                down(&JFS_IP(new_ip)->commit_sem);
+                new_size = xtTruncate_pmap(tid, new_ip, new_size);
+                if (new_size < 0) {
+                        txAbort(tid, 1);
+                        rc = new_size;          
+                } else
+                        rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC);
+                txEnd(tid);
+                up(&JFS_IP(new_ip)->commit_sem);
+        }
+        if (new_ip && (new_ip->i_nlink == 0))
+                set_cflag(COMMIT_Nolink, new_ip);
+        /*
+         * Release the inode write lock only when it was actually taken;
+         * the failures that jump to out3/out2/out1 happen before
+         * IWRITE_LOCK() and must not unlock.
+         */
+        if (new_ip_locked)
+                IWRITE_UNLOCK(new_ip);
+      out3:
+        free_UCSname(&new_dname);
+      out2:
+        free_UCSname(&old_dname);
+      out1:
+        /*
+         * Truncating the directory index table is not guaranteed.  It
+         * may need to be done iteratively
+         */
+        if (test_cflag(COMMIT_Stale, old_dir)) {
+                if (old_dir->i_size > 1)
+                        jfs_truncate_nolock(old_dir, 0);
+                clear_cflag(COMMIT_Stale, old_dir);
+        }
+        jfs_info("jfs_rename: returning %d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_mknod
+ *
+ * FUNCTION:    Create a special file (device)
+ *
+ *              Allocates a new inode with ialloc(), then, inside one
+ *              transaction and with the commit_sem of both the directory
+ *              and the new inode held, searches <dir> for a slot with
+ *              dtSearch(JFS_CREATE) and inserts the entry with dtInsert().
+ *              The encoded device number is stored in the JFS inode, the
+ *              inode is set up with init_special_inode() and hashed, the
+ *              directory's ctime/mtime are updated and both inodes are
+ *              committed with txCommit().
+ *
+ * PARAMETER:   dir     - parent directory inode
+ *              dentry  - dentry naming the new special file
+ *              mode    - mode passed on to ialloc()
+ *              rdev    - device number of the special file
+ *
+ * RETURN:      -EINVAL if new_valid_dev() rejects <rdev>,
+ *              -ENOSPC if ialloc() returns NULL,
+ *              otherwise errors from subroutines
+ */
+static int jfs_mknod(struct inode *dir, struct dentry *dentry,
+                int mode, dev_t rdev)
+{
+        struct jfs_inode_info *jfs_ip;
+        struct btstack btstack;
+        struct component_name dname;
+        ino_t ino;
+        struct inode *ip;
+        struct inode *iplist[2];
+        int rc;
+        tid_t tid;
+        struct tblock *tblk;
+        if (!new_valid_dev(rdev))
+                return -EINVAL;
+        jfs_info("jfs_mknod: %s", dentry->d_name.name);
+        if ((rc = get_UCSname(&dname, dentry)))
+                goto out;
+        ip = ialloc(dir, mode);
+        if (ip == NULL) {
+                rc = -ENOSPC;
+                goto out1;
+        }
+        jfs_ip = JFS_IP(ip);
+        tid = txBegin(dir->i_sb, 0);
+        down(&JFS_IP(dir)->commit_sem);
+        down(&JFS_IP(ip)->commit_sem);
+        if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
+                goto out3;
+        tblk = tid_to_tblock(tid);
+        tblk->xflag |= COMMIT_CREATE;   /* transaction creates an inode */
+        tblk->ino = ip->i_ino;
+        tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
+        ino = ip->i_ino;
+        if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack)))
+                goto out3;
+        ip->i_op = &jfs_file_inode_operations;
+        jfs_ip->dev = new_encode_dev(rdev);
+        init_special_inode(ip, ip->i_mode, rdev);
+        insert_inode_hash(ip);
+        mark_inode_dirty(ip);
+        dir->i_ctime = dir->i_mtime = CURRENT_TIME;     /* directory gained an entry */
+        mark_inode_dirty(dir);
+        iplist[0] = dir;
+        iplist[1] = ip;
+        rc = txCommit(tid, 2, iplist, 0);
+      out3:
+        txEnd(tid);
+        up(&JFS_IP(ip)->commit_sem);
+        up(&JFS_IP(dir)->commit_sem);
+        if (rc) {
+                ip->i_nlink = 0;        /* failed: drop the half-built inode */
+                iput(ip);
+        } else
+                d_instantiate(dentry, ip);
+      out1:
+        free_UCSname(&dname);
+#ifdef CONFIG_JFS_POSIX_ACL
+        if (rc == 0)    /* ACL setup runs only after a successful create */
+                jfs_init_acl(ip, dir);
+#endif
+      out:
+        jfs_info("jfs_mknod: returning %d", rc);
+        return rc;
+}
+/*
+ * NAME:        jfs_lookup
+ *
+ * FUNCTION:    resolve the name carried by <dentry> inside directory <dip>
+ *
+ *              "." maps to the directory itself and ".." to PARENT(dip);
+ *              any other name is searched for with dtSearch().  A name
+ *              that is not found is hashed as a negative dentry.
+ *
+ * PARAMETER:   dip     - directory being searched
+ *              dentry  - dentry holding the name to look up
+ *              nd      - not used here
+ *
+ * RETURN:      NULL when no entry exists, the dentry returned by
+ *              d_splice_alias() on success, ERR_PTR(-EACCES) if the inode
+ *              cannot be read, otherwise ERR_PTR() of a subroutine error
+ */
+static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd)
+{
+        const char *name = dentry->d_name.name;
+        int len = dentry->d_name.len;
+        struct component_name key;
+        struct btstack btstack;
+        struct inode *ip;
+        ino_t inum;
+        int rc;
+        jfs_info("jfs_lookup: name = %s", name);
+        if ((len == 1) && (name[0] == '.')) {
+                inum = dip->i_ino;
+        } else if (!strcmp(name, "..")) {
+                inum = PARENT(dip);
+        } else {
+                rc = get_UCSname(&key, dentry);
+                if (rc)
+                        return ERR_PTR(rc);
+                rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP);
+                free_UCSname(&key);
+                if (rc == -ENOENT) {
+                        /* no such entry: hash a negative dentry */
+                        d_add(dentry, NULL);
+                        return NULL;
+                }
+                if (rc) {
+                        jfs_err("jfs_lookup: dtSearch returned %d", rc);
+                        return ERR_PTR(rc);
+                }
+        }
+        ip = iget(dip->i_sb, inum);
+        if (!ip || is_bad_inode(ip)) {
+                jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum);
+                if (ip)
+                        iput(ip);
+                return ERR_PTR(-EACCES);
+        }
+        /* OS/2-flagged mounts get the case-insensitive dentry methods */
+        if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
+                dentry->d_op = &jfs_ci_dentry_operations;
+        dentry = d_splice_alias(ip, dentry);
+        /* d_splice_alias() may hand back a different dentry: set it up too */
+        if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
+                dentry->d_op = &jfs_ci_dentry_operations;
+        return dentry;
+}
+/*
+ * NAME:        jfs_get_parent
+ *
+ * FUNCTION:    return a dentry for the parent of <dentry>, using the
+ *              parent inode number (idotdot) stored in the dtree root
+ *              header of <dentry>'s inode
+ *
+ * RETURN:      the parent dentry, or ERR_PTR(-ENOENT) if iget() yields no
+ *              inode, ERR_PTR(-EACCES) if the inode is bad,
+ *              ERR_PTR(-ENOMEM) if no dentry could be allocated
+ */
+struct dentry *jfs_get_parent(struct dentry *dentry)
+{
+        struct inode *child = dentry->d_inode;
+        struct inode *parent_inode;
+        struct dentry *parent;
+        unsigned long parent_ino;
+        parent_ino = le32_to_cpu(JFS_IP(child)->i_dtroot.header.idotdot);
+        parent_inode = iget(child->i_sb, parent_ino);
+        if (!parent_inode)
+                return ERR_PTR(-ENOENT);
+        if (is_bad_inode(parent_inode)) {
+                iput(parent_inode);
+                return ERR_PTR(-EACCES);
+        }
+        parent = d_alloc_anon(parent_inode);
+        if (parent)
+                return parent;
+        /* no dentry: give the inode reference back */
+        iput(parent_inode);
+        return ERR_PTR(-ENOMEM);
+}
+struct inode_operations jfs_dir_inode_operations = {    /* inode method table; presumably for directory inodes (installation not shown here) */
+        .create         = jfs_create,
+        .lookup         = jfs_lookup,
+        .link           = jfs_link,
+        .unlink         = jfs_unlink,
+        .symlink        = jfs_symlink,
+        .mkdir          = jfs_mkdir,
+        .rmdir          = jfs_rmdir,
+        .mknod          = jfs_mknod,
+        .rename         = jfs_rename,
+        .setxattr       = jfs_setxattr,
+        .getxattr       = jfs_getxattr,
+        .listxattr      = jfs_listxattr,
+        .removexattr    = jfs_removexattr,
+#ifdef CONFIG_JFS_POSIX_ACL     /* the two hooks below exist only in POSIX ACL builds */
+        .setattr        = jfs_setattr,
+        .permission     = jfs_permission,
+#endif
+};
+struct file_operations jfs_dir_operations = {   /* file method table; presumably for open directories (installation not shown here) */
+        .read           = generic_read_dir,
+        .readdir        = jfs_readdir,
+        .fsync          = jfs_fsync,
+};
+/*
+ * Case-insensitive name hash: fold every character of <this> to lower
+ * case before feeding it to the name hash, and store the result in
+ * this->hash.  <dir> is not used.  Always returns 0.
+ */
+static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
+{
+        unsigned long hash = init_name_hash();
+        int pos = 0;
+        while (pos < this->len) {
+                hash = partial_name_hash(tolower(this->name[pos]), hash);
+                pos++;
+        }
+        this->hash = end_name_hash(hash);
+        return 0;
+}
+/*
+ * Case-insensitive name comparison.  Returns 0 when <a> and <b> have the
+ * same length and match character by character after lower-casing, and 1
+ * otherwise.  On a match the bytes of <a> are overwritten with those of
+ * <b> (see below).  <dir> is not used.
+ */
+static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b)
+{
+        int pos;
+        if (a->len != b->len)
+                return 1;
+        for (pos = 0; pos < a->len; pos++)
+                if (tolower(a->name[pos]) != tolower(b->name[pos]))
+                        return 1;
+        /*
+         * We want creates to preserve case.  A negative dentry, a, that
+         * has a different case than b may cause a new entry to be created
+         * with the wrong case.  Since we can't tell if a comes from a negative
+         * dentry, we blindly replace it with b.  This should be harmless if
+         * a is not a negative dentry.
+         */
+        memcpy((unsigned char *)a->name, b->name, a->len);
+        return 0;
+}
+struct dentry_operations jfs_ci_dentry_operations =     /* case-insensitive dentry methods; jfs_lookup() installs them when the JFS_OS2 mount flag is set */
+{
+        .d_hash = jfs_ci_hash,          /* hash of the lower-cased name */
+        .d_compare = jfs_ci_compare,    /* comparison ignoring case */
+};
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
new file mode 100644
index 000000000000..2eb6869b6e72
--- /dev/null
+++ b/fs/jfs/resize.c
@@ -0,0 +1,537 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2000-2004
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dinode.h"
+#include "jfs_imap.h"
+#include "jfs_dmap.h"
+#include "jfs_superblock.h"
+#include "jfs_txnmgr.h"
+#include "jfs_debug.h"
+/* PSIZE * 8: bits per page (PSIZE is presumably the page size in bytes) */
+#define BITSPERPAGE     (PSIZE << 3)
+/* log2 of one megabyte, and one megabyte in bytes */
+#define L2MEGABYTE      20
+#define MEGABYTE        (1 << L2MEGABYTE)
+/* 32 megabytes; jfs_extendfs() caps the default inline log size with it */
+#define MEGABYTE32     (MEGABYTE << 5)
+/* convert block number to bmap file page number */
+/* NOTE(review): the shifts by 13/23/33 and the "+ 3 + 1" presumably mirror
+ * the dmap / L0 / L1 control-page layout of the bmap file - confirm against
+ * jfs_dmap.h.  The argument needs a 64-bit type for the >> 33 term. */
+#define BLKTODMAPN(b)\
+        (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
+/*
+ *      jfs_extendfs()
+ *
+ * function: extend file system;
+ *
+ *   |-------------------------------|----------|----------|
+ *   file system space               fsck       inline log
+ *                                   workspace  space
+ *
+ * input:
+ *      sb: super block of the mounted file system to extend
+ *      newLVSize: new LV size in LV blocks (required)
+ *      newLogSize: new inline log size in megabytes (optional; 0 selects
+ *                  the default derived from newLVSize)
+ *
+ * new configuration:
+ * 1. set new LogSize as specified or default from new LVSize;
+ * 2. compute new FSCKSize from new LVSize;
+ * 3. set new FSSize as LVSize-(LogSize+FSCKSize) where
+ *    assert(new FSSize >= old FSSize),
+ *    i.e., file system must not be shrunk;
+ *
+ * return: 0 on success (also when the volume has not grown); otherwise
+ *      -EINVAL, -EROFS, -EIO or the error code of the helper that failed.
+ *      Failures after the file system has been quiesced are additionally
+ *      reported through jfs_error() before transactions are resumed.
+ */
+int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
+{
+        int rc = 0;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct inode *ipbmap = sbi->ipbmap;
+        struct inode *ipbmap2;
+        struct inode *ipimap = sbi->ipimap;
+        struct jfs_log *log = sbi->log;
+        struct bmap *bmp = sbi->bmap;
+        s64 newLogAddress, newFSCKAddress;
+        int newFSCKSize;
+        s64 newMapSize = 0, mapSize;
+        s64 XAddress, XSize, nblocks, xoff, xaddr, t64;
+        s64 oldLVSize;
+        s64 newFSSize;
+        s64 VolumeSize;
+        int newNpages = 0, nPages, newPage, xlen, t32;
+        int tid;
+        int log_formatted = 0;
+        struct inode *iplist[1];
+        struct jfs_superblock *j_sb, *j_sb2;
+        uint old_agsize;
+        struct buffer_head *bh, *bh2;
+        /* If the volume hasn't grown, get out now */
+        if (sbi->mntflag & JFS_INLINELOG)
+                oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd);
+        else
+                oldLVSize = addressPXD(&sbi->fsckpxd) +
+                    lengthPXD(&sbi->fsckpxd);
+        if (oldLVSize >= newLVSize) {
+                printk(KERN_WARNING
+                       "jfs_extendfs: volume hasn't grown, returning\n");
+                goto out;
+        }
+        VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+        if (VolumeSize) {
+                if (newLVSize > VolumeSize) {
+                        printk(KERN_WARNING "jfs_extendfs: invalid size\n");
+                        rc = -EINVAL;
+                        goto out;
+                }
+        } else {
+                /*
+                 * device size unknown: check the device by reading the
+                 * last block of the requested size
+                 */
+                bh = sb_bread(sb, newLVSize - 1);
+                if (!bh) {
+                        printk(KERN_WARNING "jfs_extendfs: invalid size\n");
+                        rc = -EINVAL;
+                        goto out;
+                }
+                bforget(bh);
+        }
+        /* Can't extend write-protected drive */
+        if (isReadOnly(ipbmap)) {
+                printk(KERN_WARNING "jfs_extendfs: read-only file system\n");
+                rc = -EROFS;
+                goto out;
+        }
+        /*
+         *      reconfigure LV spaces
+         *      ---------------------
+         *
+         * validate new size, or, if not specified, determine new size
+         */
+        /*
+         * reconfigure inline log space:
+         */
+        if ((sbi->mntflag & JFS_INLINELOG)) {
+                if (newLogSize == 0) {
+                        /*
+                         * no size specified: default to 1/256 of aggregate
+                         * size; rounded up to a megabyte boundary;
+                         * capped at 32 megabytes;
+                         */
+                        newLogSize = newLVSize >> 8;
+                        t32 = (1 << (20 - sbi->l2bsize)) - 1;
+                        newLogSize = (newLogSize + t32) & ~t32;
+                        newLogSize =
+                            min(newLogSize, MEGABYTE32 >> sbi->l2bsize);
+                } else {
+                        /*
+                         * convert the newLogSize to fs blocks.
+                         *
+                         * Since this is given in megabytes, it will always be
+                         * an even number of pages.
+                         */
+                        newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize;
+                }
+        } else
+                newLogSize = 0;
+        newLogAddress = newLVSize - newLogSize;
+        /*
+         * reconfigure fsck work space:
+         *
+         * configure it to the end of the logical volume regardless of
+         * whether file system extends to the end of the aggregate;
+         * Need enough 4k pages to cover:
+         *  - 1 bit per block in aggregate rounded up to BPERDMAP boundary
+         *  - 1 extra page to handle control page and intermediate level pages
+         *  - 50 extra pages for the chkdsk service log
+         */
+        t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
+            << L2BPERDMAP;
+        t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50;
+        newFSCKSize = t32 << sbi->l2nbperpage;
+        newFSCKAddress = newLogAddress - newFSCKSize;
+        /*
+         * compute new file system space;
+         */
+        newFSSize = newLVSize - newLogSize - newFSCKSize;
+        /* file system cannot be shrunk */
+        if (newFSSize < bmp->db_mapsize) {
+                rc = -EINVAL;
+                goto out;
+        }
+        /*
+         * If we're expanding enough that the inline log does not overlap
+         * the old one, we can format the new log before we quiesce the
+         * filesystem.
+         */
+        if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) {
+                if ((rc = lmLogFormat(log, newLogAddress, newLogSize)))
+                        goto out;
+                log_formatted = 1;
+        }
+        /*
+         *      quiesce file system
+         *
+         * (prepare to move the inline log and to prevent map update)
+         *
+         * block any new transactions and wait for completion of
+         * all wip transactions and flush modified pages s.t.
+         * on-disk file system is in consistent state and
+         * log is not required for recovery.
+         */
+        txQuiesce(sb);
+        if (sbi->mntflag & JFS_INLINELOG) {
+                /*
+                 * deactivate old inline log
+                 */
+                lmLogShutdown(log);
+                /*
+                 * mark on-disk super block for fs in transition;
+                 *
+                 * update on-disk superblock for the new space configuration
+                 * of inline log space and fsck work space descriptors:
+                 * N.B. FS descriptor is NOT updated;
+                 *
+                 * crash recovery:
+                 * logredo(): if FM_EXTENDFS, return to fsck() for cleanup;
+                 * fsck(): if FM_EXTENDFS, reformat inline log and fsck
+                 * workspace from superblock inline log descriptor and fsck
+                 * workspace descriptor;
+                 */
+                /* read in superblock */
+                if ((rc = readSuper(sb, &bh)))
+                        goto error_out;
+                j_sb = (struct jfs_superblock *)bh->b_data;
+                /* mark extendfs() in progress */
+                j_sb->s_state |= cpu_to_le32(FM_EXTENDFS);
+                j_sb->s_xsize = cpu_to_le64(newFSSize);
+                PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress);
+                PXDlength(&j_sb->s_xfsckpxd, newFSCKSize);
+                PXDaddress(&j_sb->s_xlogpxd, newLogAddress);
+                PXDlength(&j_sb->s_xlogpxd, newLogSize);
+                /* synchronously update superblock */
+                mark_buffer_dirty(bh);
+                sync_dirty_buffer(bh);
+                brelse(bh);
+                /*
+                 * format new inline log synchronously;
+                 *
+                 * crash recovery: if log move in progress,
+                 * reformat log and exit success;
+                 */
+                if (!log_formatted)
+                        if ((rc = lmLogFormat(log, newLogAddress, newLogSize)))
+                                goto error_out;
+                /*
+                 * activate new log
+                 */
+                log->base = newLogAddress;
+                log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits);
+                if ((rc = lmLogInit(log)))
+                        goto error_out;
+        }
+        /*
+         *      extend block allocation map
+         *      ---------------------------
+         *
+         * extendfs() for new extension, retry after crash recovery;
+         *
+         * note: both logredo() and fsck() rebuild map from
+         * the bitmap and configuration parameter from superblock
+         * (disregarding all other control information in the map);
+         *
+         * superblock:
+         *  s_size: aggregate size in physical blocks;
+         */
+        /*
+         *      compute the new block allocation map configuration
+         *
+         * map dinode:
+         *  di_size: map file size in byte;
+         *  di_nblocks: number of blocks allocated for map file;
+         *  di_mapsize: number of blocks in aggregate (covered by map);
+         * map control page:
+         *  db_mapsize: number of blocks in aggregate (covered by map);
+         */
+        newMapSize = newFSSize;
+        /* number of data pages of new bmap file:
+         * roundup new size to full dmap page boundary and
+         * add 1 extra dmap page for next extendfs()
+         */
+        t64 = (newMapSize - 1) + BPERDMAP;
+        newNpages = BLKTODMAPN(t64) + 1;
+        /*
+         *      extend map from current map (WITHOUT growing mapfile)
+         *
+         * map new extension with unmapped part of the last partial
+         * dmap page, if applicable, and extra page(s) allocated
+         * at end of bmap by mkfs() or previous extendfs();
+         */
+      extendBmap:
+        /* compute number of blocks requested to extend */
+        mapSize = bmp->db_mapsize;
+        XAddress = mapSize;     /* eXtension Address */
+        XSize = newMapSize - mapSize;   /* eXtension Size */
+        old_agsize = bmp->db_agsize;    /* We need to know if this changes */
+        /* compute number of blocks that can be extended by current mapfile */
+        t64 = dbMapFileSizeToMapSize(ipbmap);
+        if (mapSize > t64) {
+                printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n",
+                       (long long) mapSize, (long long) t64);
+                rc = -EIO;
+                goto error_out;
+        }
+        nblocks = min(t64 - mapSize, XSize);
+        /*
+         * update map pages for new extension:
+         *
+         * update/init dmap and bubble up the control hierarchy
+         * incrementally fold up dmaps into upper levels;
+         * update bmap control page;
+         */
+        if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
+                goto error_out;
+        /*
+         * the map now has extended to cover additional nblocks:
+         * dn_mapsize = oldMapsize + nblocks;
+         */
+        /* ipbmap->i_mapsize += nblocks; */
+        XSize -= nblocks;
+        /*
+         *      grow map file to cover remaining extension
+         *      and/or one extra dmap page for next extendfs();
+         *
+         * allocate new map pages and its backing blocks, and
+         * update map file xtree
+         */
+        /* compute number of data pages of current bmap file */
+        nPages = ipbmap->i_size >> L2PSIZE;
+        /* need to grow map file ? */
+        if (nPages == newNpages)
+                goto finalizeBmap;
+        /*
+         * grow bmap file for the new map pages required:
+         *
+         * allocate growth at the start of newly extended region;
+         * bmap file only grows sequentially, i.e., both data pages
+         * and possibly xtree index pages may grow in append mode,
+         * s.t. logredo() can reconstruct pre-extension state
+         * by washing away bmap file of pages outside s_size boundary;
+         */
+        /*
+         * journal map file growth as if a regular file growth:
+         * (note: bmap is created with di_mode = IFJOURNAL|IFREG);
+         *
+         * journaling of bmap file growth is not required since
+         * logredo() do/can not use log records of bmap file growth
+         * but it provides careful write semantics, pmap update, etc.;
+         */
+        /* synchronous write of data pages: bmap data pages are
+         * cached in meta-data cache, and not written out
+         * by txCommit();
+         */
+        filemap_fdatawait(ipbmap->i_mapping);
+        filemap_fdatawrite(ipbmap->i_mapping);
+        filemap_fdatawait(ipbmap->i_mapping);
+        diWriteSpecial(ipbmap, 0);
+        newPage = nPages;       /* first new page number */
+        xoff = newPage << sbi->l2nbperpage;
+        xlen = (newNpages - nPages) << sbi->l2nbperpage;
+        xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1);
+        xaddr = XAddress;
+        tid = txBegin(sb, COMMIT_FORCE);
+        if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) {
+                txEnd(tid);
+                goto error_out;
+        }
+        /* update bmap file size */
+        ipbmap->i_size += xlen << sbi->l2bsize;
+        inode_add_bytes(ipbmap, xlen << sbi->l2bsize);
+        iplist[0] = ipbmap;
+        rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
+        txEnd(tid);
+        if (rc)
+                goto error_out;
+        /*
+         * map file has been grown now to cover extension to further out;
+         * di_size = new map file size;
+         *
+         * if huge extension, the previous extension based on previous
+         * map file size may not have been sufficient to cover whole extension
+         * (it could have been used up for new map pages),
+         * but the newly grown map file now covers lot bigger new free space
+         * available for further extension of map;
+         */
+        /* any more blocks to extend ? */
+        if (XSize)
+                goto extendBmap;
+      finalizeBmap:
+        /* finalize bmap */
+        dbFinalizeBmap(ipbmap);
+        /*
+         *      update inode allocation map
+         *      ---------------------------
+         *
+         * move iag lists from old to new iag;
+         * agstart field is not updated for logredo() to reconstruct
+         * iag lists if system crash occurs.
+         * (computation of ag number from agstart based on agsize
+         * will correctly identify the new ag);
+         */
+        /* if new AG size the same as old AG size, done! */
+        if (bmp->db_agsize != old_agsize) {
+                if ((rc = diExtendFS(ipimap, ipbmap)))
+                        goto error_out;
+                /* finalize imap */
+                if ((rc = diSync(ipimap)))
+                        goto error_out;
+        }
+        /*
+         *      finalize
+         *      --------
+         *
+         * extension is committed when on-disk super block is
+         * updated with new descriptors: logredo will recover
+         * crash before it to pre-extension state;
+         */
+        /* sync log to skip log replay of bmap file growth transaction; */
+        /* lmLogSync(log, 1); */
+        /*
+         * synchronous write bmap global control page;
+         * for crash before completion of write
+         * logredo() will recover to pre-extendfs state;
+         * for crash after completion of write,
+         * logredo() will recover post-extendfs state;
+         */
+        if ((rc = dbSync(ipbmap)))
+                goto error_out;
+        /*
+         * copy primary bmap inode to secondary bmap inode
+         */
+        ipbmap2 = diReadSpecial(sb, BMAP_I, 1);
+        if (ipbmap2 == NULL) {
+                printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n");
+                /* rc is still 0 here; do not report this failure as success */
+                rc = -EIO;
+                goto error_out;
+        }
+        /* NOTE(review): 288 is presumably the size of i_xtroot - confirm */
+        memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288);
+        ipbmap2->i_size = ipbmap->i_size;
+        ipbmap2->i_blocks = ipbmap->i_blocks;
+        diWriteSpecial(ipbmap2, 1);
+        diFreeSpecial(ipbmap2);
+        /*
+         *      update superblock
+         */
+        if ((rc = readSuper(sb, &bh)))
+                goto error_out;
+        j_sb = (struct jfs_superblock *)bh->b_data;
+        /* mark extendfs() completion */
+        j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS);
+        j_sb->s_size = cpu_to_le64(bmp->db_mapsize <<
+                                   le16_to_cpu(j_sb->s_l2bfactor));
+        j_sb->s_agsize = cpu_to_le32(bmp->db_agsize);
+        /* update inline log space descriptor */
+        if (sbi->mntflag & JFS_INLINELOG) {
+                PXDaddress(&(j_sb->s_logpxd), newLogAddress);
+                PXDlength(&(j_sb->s_logpxd), newLogSize);
+        }
+        /* record log's mount serial number */
+        j_sb->s_logserial = cpu_to_le32(log->serial);
+        /* update fsck work space descriptor */
+        PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress);
+        PXDlength(&(j_sb->s_fsckpxd), newFSCKSize);
+        j_sb->s_fscklog = 1;
+        /* sb->s_fsckloglen remains the same */
+        /* Update secondary superblock (best effort: skipped if unreadable) */
+        bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits);
+        if (bh2) {
+                j_sb2 = (struct jfs_superblock *)bh2->b_data;
+                memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock));
+                /*
+                 * dirty the secondary buffer itself; sync_dirty_buffer()
+                 * has nothing to write for a buffer that is not dirty
+                 */
+                mark_buffer_dirty(bh2);
+                sync_dirty_buffer(bh2);
+                brelse(bh2);
+        }
+        /* write primary superblock */
+        mark_buffer_dirty(bh);
+        sync_dirty_buffer(bh);
+        brelse(bh);
+        goto resume;
+      error_out:
+        jfs_error(sb, "jfs_extendfs");
+      resume:
+        /*
+         *      resume file system transactions
+         */
+        txResume(sb);
+      out:
+        return rc;
+}
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
new file mode 100644
index 000000000000..5856866e24fc
--- /dev/null
+++ b/fs/jfs/super.c
@@ -0,0 +1,700 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2004
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <linux/completion.h>
+#include <linux/vfs.h>
+#include <linux/moduleparam.h>
+#include <asm/uaccess.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_acl.h"
+#include "jfs_debug.h"
+/* module metadata */
+MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
+MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
+MODULE_LICENSE("GPL");
+/* slab cache used by jfs_alloc_inode() and jfs_destroy_inode() */
+static kmem_cache_t * jfs_inode_cachep;
+/* declared here without initializers; presumably defined later in this file */
+static struct super_operations jfs_super_operations;
+static struct export_operations jfs_export_operations;
+static struct file_system_type jfs_fs_type;
+/* upper bound on commit threads; also the size of jfsCommitThread[] */
+#define MAX_COMMIT_THREADS 64
+/* "commit_threads" module parameter (permission 0), defaults to 0 */
+static int commit_threads = 0;
+module_param(commit_threads, int, 0);
+MODULE_PARM_DESC(commit_threads, "Number of commit threads");
+/* NOTE(review): presumably set to ask the JFS kernel threads to exit */
+int jfs_stop_threads;
+/* pids of the JFS helper threads (presumably filled in at module init) */
+static pid_t jfsIOthread;
+static pid_t jfsCommitThread[MAX_COMMIT_THREADS];
+static pid_t jfsSyncThread;
+DECLARE_COMPLETION(jfsIOwait);
+#ifdef CONFIG_JFS_DEBUG
+/* debug builds only: log level, exposed as a module parameter (mode 0644) */
+int jfsloglevel = JFS_LOGLEVEL_WARN;
+module_param(jfsloglevel, int, 0644);
+MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)");
+#endif
+/*
+ * External declarations
+ *
+ * Functions and objects that are used in this file but not defined in it
+ * (for example, jfs_extendfs() lives in resize.c).  They are declared
+ * here directly rather than through a shared header.
+ */
+extern int jfs_mount(struct super_block *);
+extern int jfs_mount_rw(struct super_block *, int);
+extern int jfs_umount(struct super_block *);
+extern int jfs_umount_rw(struct super_block *);
+extern int jfsIOWait(void *);
+extern int jfs_lazycommit(void *);
+extern int jfs_sync(void *);
+extern void jfs_read_inode(struct inode *inode);
+extern void jfs_dirty_inode(struct inode *inode);
+extern void jfs_delete_inode(struct inode *inode);
+extern int jfs_write_inode(struct inode *inode, int wait);
+extern struct dentry *jfs_get_parent(struct dentry *dentry);
+extern int jfs_extendfs(struct super_block *, s64, int);
+extern struct dentry_operations jfs_ci_dentry_operations;
+/* proc interface hooks, only declared when PROC_FS_JFS is defined */
+#ifdef PROC_FS_JFS              /* see jfs_debug.h */
+extern void jfs_proc_init(void);
+extern void jfs_proc_clean(void);
+#endif
+/* wait queues of the JFS kernel threads, defined outside this file */
+extern wait_queue_head_t jfs_IO_thread_wait;
+extern wait_queue_head_t jfs_commit_thread_wait;
+extern wait_queue_head_t jfs_sync_thread_wait;
+/*
+ * Apply the configured error policy after a file system error.
+ *
+ * Nothing happens on a read-only mount.  Otherwise the superblock is
+ * updated with FM_DIRTY and then, depending on the JFS_ERR_* bits in
+ * sbi->flag, the kernel panics, the file system is flipped to read-only,
+ * or execution simply continues.
+ */
+static void jfs_handle_error(struct super_block *sb)
+{
+        struct jfs_sb_info *info = JFS_SBI(sb);
+        if (!(sb->s_flags & MS_RDONLY)) {
+                updateSuper(sb, FM_DIRTY);
+                if (info->flag & JFS_ERR_PANIC) {
+                        panic("JFS (device %s): panic forced after error\n",
+                                sb->s_id);
+                } else if (info->flag & JFS_ERR_REMOUNT_RO) {
+                        jfs_err("ERROR: (device %s): remounting filesystem "
+                                "as read-only\n",
+                                sb->s_id);
+                        sb->s_flags |= MS_RDONLY;
+                }
+                /* "continue" policy: marking the superblock dirty is all */
+        }
+}
+/*
+ * Report a file system error and apply the mount's error policy.
+ *
+ * sb:       super block the error belongs to (sb->s_id names the device)
+ * function: printf-style format string, followed by its arguments
+ *
+ * The formatted message is printed at KERN_ERR and jfs_handle_error() is
+ * then called for sb.
+ */
+void jfs_error(struct super_block *sb, const char * function, ...)
+{
+        /*
+         * NOTE(review): this buffer is static and not protected by any
+         * lock in this function, so concurrent callers may interleave
+         * their messages - confirm whether that is acceptable.
+         */
+        static char error_buf[256];
+        va_list args;
+        va_start(args, function);
+        /* bounded formatting: an over-long message is truncated */
+        vsnprintf(error_buf, sizeof(error_buf), function, args);
+        va_end(args);
+        printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf);
+        jfs_handle_error(sb);
+}
+/*
+ * Allocate a jfs_inode_info from jfs_inode_cachep (GFP_NOFS) and hand
+ * back its embedded VFS inode, or NULL when the allocation fails.
+ */
+static struct inode *jfs_alloc_inode(struct super_block *sb)
+{
+        struct jfs_inode_info *ji =
+                kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
+        return ji ? &ji->vfs_inode : NULL;
+}
+/*
+ * Release the JFS in-core inode that wraps a VFS inode: drop its
+ * active allocation-group reference (if any), release cached ACLs when
+ * POSIX ACL support is configured, and return the object to
+ * jfs_inode_cachep.
+ */
+static void jfs_destroy_inode(struct inode *inode)
+{
+        struct jfs_inode_info *jfs_ip = JFS_IP(inode);
+        /* give up the per-AG "active" count under the inode's ag_lock */
+        spin_lock_irq(&jfs_ip->ag_lock);
+        if (jfs_ip->active_ag != -1) {
+                atomic_dec(&JFS_SBI(inode->i_sb)->bmap->
+                           db_active[jfs_ip->active_ag]);
+                jfs_ip->active_ag = -1;
+        }
+        spin_unlock_irq(&jfs_ip->ag_lock);
+#ifdef CONFIG_JFS_POSIX_ACL
+        /* drop the access ACL, then the default ACL, if they were cached */
+        if (jfs_ip->i_acl != JFS_ACL_NOT_CACHED) {
+                posix_acl_release(jfs_ip->i_acl);
+                jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
+        }
+        if (jfs_ip->i_default_acl != JFS_ACL_NOT_CACHED) {
+                posix_acl_release(jfs_ip->i_default_acl);
+                jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
+        }
+#endif
+        kmem_cache_free(jfs_inode_cachep, jfs_ip);
+}
+/*
+ * Fill in file system statistics for statfs().
+ *
+ * Block counts come straight from the block allocation map.  Inode
+ * counts are an estimate (see the comment in the body).  Always
+ * returns 0.
+ */
+static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
+        s64 estimate;
+        s64 maxinodes;
+        jfs_info("In jfs_statfs");
+        buf->f_type = JFS_SUPER_MAGIC;
+        buf->f_bsize = sbi->bsize;
+        buf->f_blocks = sbi->bmap->db_mapsize;
+        buf->f_bfree = sbi->bmap->db_nfree;
+        buf->f_bavail = sbi->bmap->db_nfree;
+        /*
+         * Reporting the real number of allocated and free inodes
+         * (im_numinos / im_numfree) makes some applications fail because
+         * they do not see enough free inodes.  Report instead a guess at
+         * how many inodes could be allocated: the inodes that already
+         * exist plus those that would fit in the free blocks, limited to
+         * 0xffffffff.
+         */
+        estimate = (s64) atomic_read(&imap->im_numinos) +
+                ((sbi->bmap->db_nfree >> imap->im_l2nbperiext)
+                 << L2INOSPEREXT);
+        maxinodes = min(estimate, (s64) 0xffffffffLL);
+        buf->f_files = maxinodes;
+        /* free = estimated maximum minus the inodes currently in use */
+        buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) -
+                                    atomic_read(&imap->im_numfree));
+        buf->f_namelen = JFS_NAME_MAX;
+        return 0;
+}
+/*
+ * Tear down a mounted JFS instance: unmount it (a failure is only
+ * logged), unload the NLS table if one is loaded, and free the
+ * jfs_sb_info.
+ */
+static void jfs_put_super(struct super_block *sb)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        int err;
+        jfs_info("In jfs_put_super");
+        if ((err = jfs_umount(sb)) != 0)
+                jfs_err("jfs_umount failed with return code %d", err);
+        if (sbi->nls_tab != NULL)
+                unload_nls(sbi->nls_tab);
+        sbi->nls_tab = NULL;
+        kfree(sbi);
+}
+/*
+ * Mount option tokens, as returned by match_token() in parse_options().
+ * Opt_err belongs to the table's terminating entry.
+ */
+enum {
+        Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
+        Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err,
+};
+/*
+ * Mount option patterns.  "resize" appears twice: with a numeric
+ * argument (Opt_resize) and bare (Opt_resize_nosize).  The quota-related
+ * options all map to Opt_ignore, which parse_options() accepts and
+ * ignores.
+ */
+static match_table_t tokens = {
+        {Opt_integrity, "integrity"},
+        {Opt_nointegrity, "nointegrity"},
+        {Opt_iocharset, "iocharset=%s"},
+        {Opt_resize, "resize=%u"},
+        {Opt_resize_nosize, "resize"},
+        {Opt_errors, "errors=%s"},
+        {Opt_ignore, "noquota"},
+        {Opt_ignore, "quota"},
+        {Opt_ignore, "usrquota"},
+        {Opt_ignore, "grpquota"},
+        {Opt_err, NULL}
+};
+/*
+ * Parse a comma-separated mount option string.
+ *
+ * options:   option string (modified in place by strsep); may be NULL
+ * sb:        super block whose jfs_sb_info receives a new NLS table
+ * newLVSize: set to the requested resize target, or 0 if none was given
+ * flag:      in/out JFS mount flags (integrity and error-policy bits)
+ *
+ * Returns 1 on success and 0 on a bad option.  An NLS table loaded while
+ * parsing is installed in the super block info only on success; on
+ * failure it is unloaded again.
+ */
+static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
+                         int *flag)
+{
+        void *nls_map = (void *)-1;     /* -1: no change;  NULL: none */
+        char *opt;
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        *newLVSize = 0;
+        if (options == NULL)
+                return 1;
+        for (opt = strsep(&options, ","); opt != NULL;
+             opt = strsep(&options, ",")) {
+                substring_t args[MAX_OPT_ARGS];
+                /* skip empty fields such as those produced by ",," */
+                if (*opt == '\0')
+                        continue;
+                switch (match_token(opt, tokens, args)) {
+                case Opt_integrity:
+                        *flag &= ~JFS_NOINTEGRITY;
+                        break;
+                case Opt_nointegrity:
+                        *flag |= JFS_NOINTEGRITY;
+                        break;
+                case Opt_ignore:
+                        /* quota options are accepted and silently ignored */
+                        break;
+                case Opt_iocharset:
+                        /* a later iocharset= replaces an earlier one */
+                        if (nls_map != NULL && nls_map != (void *) -1)
+                                unload_nls(nls_map);
+                        if (strcmp(args[0].from, "none") == 0) {
+                                nls_map = NULL;
+                                break;
+                        }
+                        nls_map = load_nls(args[0].from);
+                        if (nls_map == NULL) {
+                                printk(KERN_ERR
+                                       "JFS: charset not found\n");
+                                goto cleanup;
+                        }
+                        break;
+                case Opt_resize:
+                {
+                        char *end = args[0].from;
+                        *newLVSize = simple_strtoull(end, &end, 0);
+                        break;
+                }
+                case Opt_resize_nosize:
+                        /* no size given: use the size of the block device */
+                        *newLVSize = sb->s_bdev->bd_inode->i_size >>
+                                sb->s_blocksize_bits;
+                        if (*newLVSize == 0)
+                                printk(KERN_ERR
+                                       "JFS: Cannot determine volume size\n");
+                        break;
+                case Opt_errors:
+                {
+                        char *errors = args[0].from;
+                        int policy;
+                        if (!errors || !*errors)
+                                goto cleanup;
+                        if (strcmp(errors, "continue") == 0)
+                                policy = JFS_ERR_CONTINUE;
+                        else if (strcmp(errors, "remount-ro") == 0)
+                                policy = JFS_ERR_REMOUNT_RO;
+                        else if (strcmp(errors, "panic") == 0)
+                                policy = JFS_ERR_PANIC;
+                        else {
+                                printk(KERN_ERR
+                                       "JFS: %s is an invalid error handler\n",
+                                       errors);
+                                goto cleanup;
+                        }
+                        /* exactly one error-policy bit is left set */
+                        *flag &= ~(JFS_ERR_CONTINUE | JFS_ERR_REMOUNT_RO |
+                                   JFS_ERR_PANIC);
+                        *flag |= policy;
+                        break;
+                }
+                default:
+                        printk("jfs: Unrecognized mount option \"%s\" "
+                                        " or missing value\n", opt);
+                        goto cleanup;
+                }
+        }
+        if (nls_map != (void *) -1) {
+                /* Discard old (if remount) */
+                if (sbi->nls_tab)
+                        unload_nls(sbi->nls_tab);
+                sbi->nls_tab = nls_map;
+        }
+        return 1;
+cleanup:
+        /* do not leak a table that was loaded before the failure */
+        if (nls_map != NULL && nls_map != (void *) -1)
+                unload_nls(nls_map);
+        return 0;
+}
+/*
+ * Remount handler.
+ *
+ * Parses the new options, performs a resize when one was requested
+ * (read-write mounts only), and then handles the read-only <-> read-write
+ * transition.  A change of the JFS_NOINTEGRITY bit on a read-write mount
+ * is applied by a jfs_umount_rw()/jfs_mount_rw() cycle.
+ *
+ * Returns 0 on success, -EINVAL for bad options, -EROFS for a resize on a
+ * read-only mount, or the error of the helper that failed.
+ */
+static int jfs_remount(struct super_block *sb, int *flags, char *data)
+{
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        s64 newLVSize = 0;
+        int flag = sbi->flag;
+        int rc = 0;
+        if (!parse_options(data, sb, &newLVSize, &flag))
+                return -EINVAL;
+        if (newLVSize) {
+                if (sb->s_flags & MS_RDONLY) {
+                        printk(KERN_ERR
+                  "JFS: resize requires volume to be mounted read-write\n");
+                        return -EROFS;
+                }
+                rc = jfs_extendfs(sb, newLVSize, 0);
+                if (rc)
+                        return rc;
+        }
+        if (sb->s_flags & MS_RDONLY) {
+                if (!(*flags & MS_RDONLY)) {
+                        /* read-only -> read-write */
+                        sbi->flag = flag;
+                        return jfs_mount_rw(sb, 1);
+                }
+        } else if (*flags & MS_RDONLY) {
+                /* read-write -> read-only */
+                rc = jfs_umount_rw(sb);
+                sbi->flag = flag;
+                return rc;
+        } else if ((sbi->flag & JFS_NOINTEGRITY) !=
+                   (flag & JFS_NOINTEGRITY)) {
+                /* integrity mode changed while staying read-write */
+                rc = jfs_umount_rw(sb);
+                if (rc)
+                        return rc;
+                sbi->flag = flag;
+                return jfs_mount_rw(sb, 1);
+        }
+        sbi->flag = flag;
+        return 0;
+}
+/*
+ * jfs_fill_super - set up a JFS super block at mount time
+ *
+ * PARAMETERS:
+ *      sb      - VFS super block being filled in
+ *      data    - mount option string, handed to parse_options()
+ *      silent  - if non-zero, the jfs_mount()/jfs_mount_rw() failure
+ *                messages are suppressed
+ *
+ * Allocates the per-mount jfs_sb_info, parses the options, mounts the
+ * volume (plus its read-write side unless MS_RDONLY is set) and attaches
+ * the root dentry.  Every failure after the allocation leaves through
+ * out_kfree, so the jfs_sb_info and any NLS table installed by
+ * parse_options() are always released.
+ *
+ * RETURNS: 0 for success; -EOVERFLOW, -ENOMEM or -EINVAL on failure
+ */
+static int jfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct jfs_sb_info *sbi;
+        struct inode *inode;
+        int rc;
+        s64 newLVSize = 0;
+        int flag;
+        jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
+        if (!new_valid_dev(sb->s_bdev->bd_dev))
+                return -EOVERFLOW;
+        sbi = kmalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
+        if (!sbi)
+                return -ENOMEM;
+        memset(sbi, 0, sizeof (struct jfs_sb_info));
+        sb->s_fs_info = sbi;
+        sbi->sb = sb;
+        /* initialize the mount flag and determine the default error handler */
+        flag = JFS_ERR_REMOUNT_RO;
+        /*
+         * parse_options() only stores into sbi->nls_tab on success, so on
+         * failure nls_tab is still NULL and out_kfree just frees sbi.
+         */
+        if (!parse_options((char *) data, sb, &newLVSize, &flag))
+                goto out_kfree;
+        sbi->flag = flag;
+#ifdef CONFIG_JFS_POSIX_ACL
+        sb->s_flags |= MS_POSIXACL;
+#endif
+        if (newLVSize) {
+                printk(KERN_ERR "resize option for remount only\n");
+                /* must not return directly: sbi and nls_tab would leak */
+                goto out_kfree;
+        }
+        /*
+         * Initialize blocksize to 4K.
+         */
+        sb_set_blocksize(sb, PSIZE);
+        /*
+         * Set method vectors.
+         */
+        sb->s_op = &jfs_super_operations;
+        sb->s_export_op = &jfs_export_operations;
+        rc = jfs_mount(sb);
+        if (rc) {
+                if (!silent) {
+                        jfs_err("jfs_mount failed w/return code = %d", rc);
+                }
+                goto out_kfree;
+        }
+        if (sb->s_flags & MS_RDONLY)
+                sbi->log = NULL;
+        else {
+                rc = jfs_mount_rw(sb, 0);
+                if (rc) {
+                        if (!silent) {
+                                jfs_err("jfs_mount_rw failed, return code = %d",
+                                        rc);
+                        }
+                        goto out_no_rw;
+                }
+        }
+        sb->s_magic = JFS_SUPER_MAGIC;
+        inode = iget(sb, ROOT_I);
+        if (!inode || is_bad_inode(inode))
+                goto out_no_root;
+        sb->s_root = d_alloc_root(inode);
+        if (!sb->s_root)
+                goto out_no_root;
+        if (sbi->mntflag & JFS_OS2)
+                sb->s_root->d_op = &jfs_ci_dentry_operations;
+        /* logical blocks are represented by 40 bits in pxd_t, etc. */
+        sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
+#if BITS_PER_LONG == 32
+        /*
+         * Page cache is indexed by long.
+         * I would use MAX_LFS_FILESIZE, but it's only half as big
+         */
+        sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes);
+#endif
+        sb->s_time_gran = 1;
+        return 0;
+out_no_root:
+        jfs_err("jfs_read_super: get root inode failed");
+        if (inode)
+                iput(inode);
+out_no_rw:
+        rc = jfs_umount(sb);
+        if (rc) {
+                jfs_err("jfs_umount failed with return code %d", rc);
+        }
+out_kfree:
+        if (sbi->nls_tab)
+                unload_nls(sbi->nls_tab);
+        kfree(sbi);
+        return -EINVAL;
+}
+/*
+ * jfs_write_super_lockfs - write_super_lockfs method
+ *
+ * On a read-write mount: quiesce the transaction manager, shut the log
+ * down and mark the superblock FM_CLEAN.  Nothing is done on a
+ * read-only mount.
+ */
+static void jfs_write_super_lockfs(struct super_block *sb)
+{
+        struct jfs_log *journal = JFS_SBI(sb)->log;
+        if (sb->s_flags & MS_RDONLY)
+                return;
+        txQuiesce(sb);
+        lmLogShutdown(journal);
+        updateSuper(sb, FM_CLEAN);
+}
+/*
+ * jfs_unlockfs - unlockfs method, the counterpart of jfs_write_super_lockfs
+ *
+ * On a read-write mount: mark the superblock FM_MOUNT, re-initialise the
+ * log and, only if that succeeded, resume the transaction manager.
+ * Nothing is done on a read-only mount.
+ */
+static void jfs_unlockfs(struct super_block *sb)
+{
+        struct jfs_log *journal = JFS_SBI(sb)->log;
+        int err;
+        if (sb->s_flags & MS_RDONLY)
+                return;
+        updateSuper(sb, FM_MOUNT);
+        err = lmLogInit(journal);
+        if (err) {
+                jfs_err("jfs_unlock failed with return code %d", err);
+                return;
+        }
+        txResume(sb);
+}
+/*
+ * jfs_get_sb - get_sb method of the "jfs" filesystem type
+ *
+ * Delegates to get_sb_bdev(), with jfs_fill_super() as the callback
+ * that fills in the super block.
+ */
+static struct super_block *jfs_get_sb(struct file_system_type *fs_type,
+                                      int flags, const char *dev_name,
+                                      void *data)
+{
+        struct super_block *sb;
+        sb = get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+        return sb;
+}
+/*
+ * jfs_sync_fs - sync_fs method: flush the journal, if there is one
+ *
+ * RETURNS: always 0
+ */
+static int jfs_sync_fs(struct super_block *sb, int wait)
+{
+        struct jfs_log *journal = JFS_SBI(sb)->log;
+        /* log == NULL indicates read-only mount */
+        if (!journal)
+                return 0;
+        jfs_flush_journal(journal, wait);
+        return 0;
+}
+/*
+ * Super block method table.  jfs_fill_super() installs it in sb->s_op.
+ */
+static struct super_operations jfs_super_operations = {
+        .alloc_inode    = jfs_alloc_inode,
+        .destroy_inode  = jfs_destroy_inode,
+        .read_inode     = jfs_read_inode,
+        .dirty_inode    = jfs_dirty_inode,
+        .write_inode    = jfs_write_inode,
+        .delete_inode   = jfs_delete_inode,
+        .put_super      = jfs_put_super,
+        .sync_fs        = jfs_sync_fs,
+        .write_super_lockfs = jfs_write_super_lockfs,
+        .unlockfs       = jfs_unlockfs,
+        .statfs         = jfs_statfs,
+        .remount_fs     = jfs_remount,
+};
+/*
+ * Export method table.  jfs_fill_super() installs it in sb->s_export_op;
+ * only get_parent is supplied here.
+ */
+static struct export_operations jfs_export_operations = {
+        .get_parent     = jfs_get_parent,
+};
+/*
+ * Filesystem type descriptor for "jfs".  Registered by init_jfs_fs() and
+ * unregistered by exit_jfs_fs().
+ */
+static struct file_system_type jfs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "jfs",
+        .get_sb         = jfs_get_sb,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+extern int metapage_init(void);
+extern int txInit(void);
+extern void txExit(void);
+extern void metapage_exit(void);
+/*
+ * init_once - slab constructor for the jfs_inode_info cache
+ *
+ * Acts only when called as a plain constructor (SLAB_CTOR_CONSTRUCTOR set
+ * and SLAB_CTOR_VERIFY clear): zeroes the object and initialises its
+ * list head, locks and embedded VFS inode.
+ */
+static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+{
+        struct jfs_inode_info *ji = (struct jfs_inode_info *) foo;
+        unsigned long ctor_bits;
+        ctor_bits = flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR);
+        if (ctor_bits != SLAB_CTOR_CONSTRUCTOR)
+                return;
+        memset(ji, 0, sizeof(*ji));
+        INIT_LIST_HEAD(&ji->anon_inode_list);
+        init_rwsem(&ji->rdwrlock);
+        init_MUTEX(&ji->commit_sem);
+        init_rwsem(&ji->xattr_sem);
+        spin_lock_init(&ji->ag_lock);
+        ji->active_ag = -1;
+#ifdef CONFIG_JFS_POSIX_ACL
+        ji->i_acl = JFS_ACL_NOT_CACHED;
+        ji->i_default_acl = JFS_ACL_NOT_CACHED;
+#endif
+        inode_init_once(&ji->vfs_inode);
+}
+/*
+ * init_jfs_fs - module initialisation
+ *
+ * Creates the inode slab cache, initialises the metapage layer and the
+ * transaction manager, starts the I/O, commit and sync kernel threads and
+ * finally registers the filesystem type.  Each thread signals jfsIOwait
+ * once when it has started and once more when it exits.
+ *
+ * On any failure everything set up so far is torn down again and the
+ * error is returned.  A failed kernel_thread() call must store its
+ * (negative) return value in rc: rc would otherwise still hold the 0 left
+ * by txInit() and the module would report success after undoing its own
+ * initialisation.
+ *
+ * RETURNS: 0 for success; negative errno on failure
+ */
+static int __init init_jfs_fs(void)
+{
+        int i;
+        int rc;
+        jfs_inode_cachep =
+            kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
+                            SLAB_RECLAIM_ACCOUNT, init_once, NULL);
+        if (jfs_inode_cachep == NULL)
+                return -ENOMEM;
+        /*
+         * Metapage initialization
+         */
+        rc = metapage_init();
+        if (rc) {
+                jfs_err("metapage_init failed w/rc = %d", rc);
+                goto free_slab;
+        }
+        /*
+         * Transaction Manager initialization
+         */
+        rc = txInit();
+        if (rc) {
+                jfs_err("txInit failed w/rc = %d", rc);
+                goto free_metapage;
+        }
+        /*
+         * I/O completion thread (endio)
+         */
+        jfsIOthread = kernel_thread(jfsIOWait, NULL, CLONE_KERNEL);
+        if (jfsIOthread < 0) {
+                jfs_err("init_jfs_fs: fork failed w/rc = %d", jfsIOthread);
+                rc = jfsIOthread;
+                goto end_txmngr;
+        }
+        wait_for_completion(&jfsIOwait);        /* Wait until thread starts */
+        if (commit_threads < 1)
+                commit_threads = num_online_cpus();
+        if (commit_threads > MAX_COMMIT_THREADS)
+                commit_threads = MAX_COMMIT_THREADS;
+        for (i = 0; i < commit_threads; i++) {
+                jfsCommitThread[i] = kernel_thread(jfs_lazycommit, NULL,
+                                                   CLONE_KERNEL);
+                if (jfsCommitThread[i] < 0) {
+                        jfs_err("init_jfs_fs: fork failed w/rc = %d",
+                                jfsCommitThread[i]);
+                        rc = jfsCommitThread[i];
+                        /* only the threads started so far need stopping */
+                        commit_threads = i;
+                        goto kill_committask;
+                }
+                /* Wait until thread starts */
+                wait_for_completion(&jfsIOwait);
+        }
+        jfsSyncThread = kernel_thread(jfs_sync, NULL, CLONE_KERNEL);
+        if (jfsSyncThread < 0) {
+                jfs_err("init_jfs_fs: fork failed w/rc = %d", jfsSyncThread);
+                rc = jfsSyncThread;
+                goto kill_committask;
+        }
+        wait_for_completion(&jfsIOwait);        /* Wait until thread starts */
+#ifdef PROC_FS_JFS
+        jfs_proc_init();
+#endif
+        rc = register_filesystem(&jfs_fs_type);
+        if (!rc)
+                return 0;
+        /*
+         * Registration failed: undo everything, starting with the sync
+         * thread (stopped the same way exit_jfs_fs() stops it).
+         */
+#ifdef PROC_FS_JFS
+        jfs_proc_clean();
+#endif
+        jfs_stop_threads = 1;
+        wake_up(&jfs_sync_thread_wait);
+        wait_for_completion(&jfsIOwait);        /* Wait for thread exit */
+kill_committask:
+        jfs_stop_threads = 1;
+        wake_up_all(&jfs_commit_thread_wait);
+        for (i = 0; i < commit_threads; i++)
+                wait_for_completion(&jfsIOwait);
+        wake_up(&jfs_IO_thread_wait);
+        wait_for_completion(&jfsIOwait);        /* Wait for thread exit */
+end_txmngr:
+        txExit();
+free_metapage:
+        metapage_exit();
+free_slab:
+        kmem_cache_destroy(jfs_inode_cachep);
+        return rc;
+}
+/*
+ * exit_jfs_fs - module teardown
+ *
+ * Sets jfs_stop_threads, shuts down the transaction manager and metapage
+ * layer, then wakes the I/O, commit and sync threads in turn, waiting on
+ * jfsIOwait once per thread.  Finally the filesystem type is unregistered
+ * and the inode slab cache destroyed.
+ */
+static void __exit exit_jfs_fs(void)
+{
+        int remaining;
+        jfs_info("exit_jfs_fs called");
+        jfs_stop_threads = 1;
+        txExit();
+        metapage_exit();
+        /* I/O thread */
+        wake_up(&jfs_IO_thread_wait);
+        wait_for_completion(&jfsIOwait);
+        /* Commit threads: one completion per thread */
+        wake_up_all(&jfs_commit_thread_wait);
+        for (remaining = commit_threads; remaining > 0; remaining--)
+                wait_for_completion(&jfsIOwait);
+        /* Sync thread */
+        wake_up(&jfs_sync_thread_wait);
+        wait_for_completion(&jfsIOwait);
+#ifdef PROC_FS_JFS
+        jfs_proc_clean();
+#endif
+        unregister_filesystem(&jfs_fs_type);
+        kmem_cache_destroy(jfs_inode_cachep);
+}
+module_init(init_jfs_fs)
+module_exit(exit_jfs_fs)
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
new file mode 100644
index 000000000000..ef4c07ee92b2
--- /dev/null
+++ b/fs/jfs/symlink.c
@@ -0,0 +1,39 @@
+/*
+ *   Copyright (c) Christoph Hellwig, 2001-2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include "jfs_incore.h"
+#include "jfs_xattr.h"
+/*
+ * jfs_follow_link - follow_link method for JFS symlinks
+ *
+ * Hands the inode's i_inline buffer to nd_set_link() as the link target.
+ *
+ * RETURNS: always 0
+ */
+static int jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+        struct inode *inode = dentry->d_inode;
+        char *target = JFS_IP(inode)->i_inline;
+        nd_set_link(nd, target);
+        return 0;
+}
+/*
+ * Inode method table for symlinks: generic readlink, the JFS follow_link
+ * above, and the JFS extended attribute methods.
+ */
+struct inode_operations jfs_symlink_inode_operations = {
+        .readlink       = generic_readlink,
+        .follow_link    = jfs_follow_link,
+        .setxattr       = jfs_setxattr,
+        .getxattr       = jfs_getxattr,
+        .listxattr      = jfs_listxattr,
+        .removexattr    = jfs_removexattr,
+};
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
new file mode 100644
index 000000000000..7a9ffd5d03dc
--- /dev/null
+++ b/fs/jfs/xattr.c
@@ -0,0 +1,1127 @@
+/*
+ *   Copyright (C) International Business Machines  Corp., 2000-2004
+ *   Copyright (C) Christoph Hellwig, 2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or 
+ *   (at your option) any later version.
+ * 
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_debug.h"
+#include "jfs_dinode.h"
+#include "jfs_extent.h"
+#include "jfs_metapage.h"
+#include "jfs_xattr.h"
+#include "jfs_acl.h"
+/*
+ *      jfs_xattr.c: extended attribute service
+ *
+ * Overall design --
+ *
+ * Format:
+ *
+ *   Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit
+ *   value) and a variable (0 or more) number of extended attribute
+ *   entries.  Each extended attribute entry (jfs_ea) is a <name,value> pair
+ *   where <name> is constructed from a null-terminated ascii string
+ *   (1 ... 255 bytes in the name) and <value> is arbitrary 8 bit data
+ *   (1 ... 65535 bytes).  The in-memory format is
+ *
+ *   0       1        2        4                4 + namelen + 1
+ *   +-------+--------+--------+----------------+-------------------+
+ *   | Flags | Name   | Value  | Name String \0 | Data . . . .      |
+ *   |       | Length | Length |                |                   |
+ *   +-------+--------+--------+----------------+-------------------+
+ *
+ *   A jfs_ea_list then is structured as
+ *
+ *   0            4                   4 + EA_SIZE(ea1)
+ *   +------------+-------------------+--------------------+-----
+ *   | Overall EA | First FEA Element | Second FEA Element | ..... 
+ *   | List Size  |                   |                    |
+ *   +------------+-------------------+--------------------+-----
+ *
+ *   On-disk:
+ *
+ *     FEALISTs are stored on disk using blocks allocated by dbAlloc() and
+ *     written directly. An EA list may be in-lined in the inode if there is
+ *     sufficient room available.
+ */
+/*
+ * Working handle on an inode's extended attribute list, filled in by
+ * ea_get().  xattr may point at the inode's inline EA area, at a
+ * kmalloc'ed buffer or at the data of the metapage in mp; flag (EA_*
+ * values below) records which.
+ */
+struct ea_buffer {
+        int flag;               /* Indicates what storage xattr points to */
+        int max_size;           /* largest xattr that fits in current buffer */
+        dxd_t new_ea;           /* dxd to replace ea when modifying xattr */
+        struct metapage *mp;    /* metapage containing ea list */
+        struct jfs_ea_list *xattr;      /* buffer containing ea list */
+};
+/*
+ * ea_buffer.flag values
+ */
+#define EA_INLINE       0x0001
+#define EA_EXTENT       0x0002
+#define EA_NEW          0x0004
+#define EA_MALLOC       0x0008
+/* Namespaces */
+#define XATTR_SYSTEM_PREFIX "system."
+#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1)
+#define XATTR_USER_PREFIX "user."
+#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1)
+#define XATTR_OS2_PREFIX "os2."
+#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1)
+/* XATTR_SECURITY_PREFIX is defined in include/linux/xattr.h */
+#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1)
+#define XATTR_TRUSTED_PREFIX "trusted."
+#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1)
+/*
+ * These three routines are used to recognize on-disk extended attributes
+ * that are in a recognized namespace.  If the attribute is not recognized,
+ * "os2." is prepended to the name
+ */
+/*
+ * is_os2_xattr - does this on-disk EA belong to OS/2's flat namespace?
+ *
+ * RETURNS: FALSE if the name starts with "system.", "user.", "security."
+ *          or "trusted."; TRUE otherwise
+ */
+static inline int is_os2_xattr(struct jfs_ea *ea)
+{
+        /* Add any other valid namespace prefixes to this table */
+        static const struct {
+                const char *prefix;
+                size_t len;
+        } known[] = {
+                { XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN },
+                { XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN },
+                { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
+                { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN },
+        };
+        unsigned int k;
+        for (k = 0; k < sizeof (known) / sizeof (known[0]); k++) {
+                if ((ea->namelen >= known[k].len) &&
+                    !strncmp(ea->name, known[k].prefix, known[k].len))
+                        return FALSE;
+        }
+        /*
+         * No recognized prefix: we assume it's OS/2's flat namespace
+         */
+        return TRUE;
+}
+/*
+ * name_size - length of the EA's name as presented to callers, i.e.
+ * including the "os2." prefix when is_os2_xattr() says one is added.
+ * The terminating NUL is not counted.
+ */
+static inline int name_size(struct jfs_ea *ea)
+{
+        int len = ea->namelen;
+        if (is_os2_xattr(ea))
+                len += XATTR_OS2_PREFIX_LEN;
+        return len;
+}
+/*
+ * copy_name - write the EA's presented name into buffer, NUL terminated
+ *
+ * "os2." is written first when is_os2_xattr() reports the flat OS/2
+ * namespace.  The caller must supply room for name_size(ea) + 1 bytes.
+ *
+ * RETURNS: number of bytes written, not counting the terminating NUL
+ */
+static inline int copy_name(char *buffer, struct jfs_ea *ea)
+{
+        int prefix_len = 0;
+        if (is_os2_xattr(ea)) {
+                memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN);
+                prefix_len = XATTR_OS2_PREFIX_LEN;
+        }
+        memcpy(buffer + prefix_len, ea->name, ea->namelen);
+        buffer[prefix_len + ea->namelen] = 0;
+        return prefix_len + ea->namelen;
+}
+/* Forward references */
+static void ea_release(struct inode *inode, struct ea_buffer *ea_buf);
+/*
+ * NAME: ea_write_inline
+ *
+ * FUNCTION: Store an EA list in the inode's inline EA area, if possible
+ *
+ * PRE CONDITIONS:
+ *      A non-empty EA list must already be known to fit in the inline
+ *      area (this is asserted)
+ *
+ * PARAMETERS:
+ *      ip      - Inode pointer
+ *      ealist  - EA list pointer; may be NULL
+ *      size    - size of ealist in bytes
+ *      ea      - dxd_t structure filled in to describe the result
+ *
+ * NOTES:
+ *      A NULL list, or one no larger than the bare list header, counts as
+ *      empty: <ea> is zeroed and, if the inode's current EA is inline, the
+ *      inline area is marked available again.  Otherwise the list is
+ *      copied inline provided the area is free or already holds the
+ *      inode's inline EA; if not, the caller has to use an extent.
+ *
+ * RETURNS: 0 for success; -EPERM if the inline area is not available
+ */
+static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist,
+                           int size, dxd_t * ea)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        /*
+         * The NULL/empty EA list is valid, but there is nothing to copy
+         */
+        if (!ealist || size <= sizeof (struct jfs_ea_list)) {
+                ea->flag = 0;
+                DXDsize(ea, 0);
+                DXDlength(ea, 0);
+                DXDaddress(ea, 0);
+                /* Free up INLINE area */
+                if (ji->ea.flag & DXD_INLINE)
+                        ji->mode2 |= INLINEEA;
+                return 0;
+        }
+        assert(size <= sizeof (ji->i_inline_ea));
+        /*
+         * The area is usable if it is free, or if it already holds this
+         * inode's inline EA.
+         */
+        if (!((ji->mode2 & INLINEEA) || (ji->ea.flag & DXD_INLINE)))
+                return -EPERM;
+        DXDsize(ea, size);
+        DXDlength(ea, 0);
+        DXDaddress(ea, 0);
+        memcpy(ji->i_inline_ea, ealist, size);
+        ea->flag = DXD_INLINE;
+        ji->mode2 &= ~INLINEEA;
+        return 0;
+}
+/*
+ * NAME: ea_write
+ *
+ * FUNCTION: Write an EA for an inode
+ *
+ * PRE CONDITIONS: EA has been verified
+ *
+ * PARAMETERS:
+ *      ip      - Inode pointer
+ *      ealist  - EA list pointer
+ *      size    - size of ealist in bytes
+ *      ea      - dxd_t structure to be filled in appropriately with where the
+ *                EA was copied
+ *
+ * NOTES: Will write EA inline if able to, otherwise charges quota, allocates
+ *      blocks for an extent and copies the list into them one page (PSIZE)
+ *      at a time, flushing each metapage.  Unless _JFS_FIXME is defined the
+ *      result of flush_metapage() is not checked.
+ *
+ * RETURNS: 0 for success; -EDQUOT if the quota charge fails, -EIO if a
+ *      metapage cannot be obtained, or the error returned by dbAlloc()
+ */
+static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
+                       dxd_t * ea)
+{
+        struct super_block *sb = ip->i_sb;
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        int nblocks;
+        s64 blkno;
+        int rc = 0, i;
+        char *cp;
+        s32 nbytes, nb;
+        s32 bytes_to_write;
+        struct metapage *mp;
+        /*
+         * Quick check to see if this is an in-linable EA.  Short EAs
+         * and empty EAs are all in-linable, provided the space exists.
+         * If the inline attempt fails we fall through to the extent path.
+         */
+        if (!ealist || size <= sizeof (ji->i_inline_ea)) {
+                if (!ea_write_inline(ip, ealist, size, ea))
+                        return 0;
+        }
+        /* figure out how many blocks we need */
+        nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
+        /* Allocate new blocks to quota. */
+        if (DQUOT_ALLOC_BLOCK(ip, nblocks)) {
+                return -EDQUOT;
+        }
+        rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
+        if (rc) {
+                /*Rollback quota allocation. */
+                DQUOT_FREE_BLOCK(ip, nblocks);
+                return rc;
+        }
+        /*
+         * Now have nblocks worth of storage to stuff into the FEALIST.
+         * loop over the FEALIST copying data into the buffer one page at
+         * a time.
+         */
+        cp = (char *) ealist;
+        nbytes = size;
+        for (i = 0; i < nblocks; i += sbi->nbperpage) {
+                /*
+                 * Determine how many bytes for this request, and round up to
+                 * the nearest aggregate block size
+                 */
+                nb = min(PSIZE, nbytes);
+                bytes_to_write =
+                    ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits))
+                    << sb->s_blocksize_bits;
+                if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) {
+                        rc = -EIO;
+                        goto failed;
+                }
+                memcpy(mp->data, cp, nb);
+                /*
+                 * We really need a way to propagate errors for
+                 * forced writes like this one.  --hch
+                 *
+                 * (__write_metapage => release_metapage => flush_metapage)
+                 */
+#ifdef _JFS_FIXME
+                if ((rc = flush_metapage(mp))) {
+                        /*
+                         * the write failed -- this means that the buffer
+                         * is still assigned and the blocks are not being
+                         * used.  this seems like the best error recovery
+                         * we can get ...
+                         */
+                        goto failed;
+                }
+#else
+                flush_metapage(mp);
+#endif
+                /* the source always advances a full page per iteration */
+                cp += PSIZE;
+                nbytes -= nb;
+        }
+        ea->flag = DXD_EXTENT;
+        /* the recorded size comes from the list header, not from <size> */
+        DXDsize(ea, le32_to_cpu(ealist->size));
+        DXDlength(ea, nblocks);
+        DXDaddress(ea, blkno);
+        /* Free up INLINE area */
+        if (ji->ea.flag & DXD_INLINE)
+                ji->mode2 |= INLINEEA;
+        return 0;
+      failed:
+        /* Rollback quota allocation. */
+        DQUOT_FREE_BLOCK(ip, nblocks);
+        /* ... and give the freshly allocated blocks back */
+        dbFree(ip, blkno, nblocks);
+        return rc;
+}
+/*
+ * NAME: ea_read_inline
+ *
+ * FUNCTION: Copy an inlined EA list out of the inode
+ *
+ * PARAMETERS:
+ *      ip      - Inode pointer
+ *      ealist  - Pointer to buffer to fill in with EA
+ *
+ * NOTES: When the inode's EA descriptor records a size of 0, only
+ *      ealist->size is written (set to 0).
+ *
+ * RETURNS: 0 for success; -EIO if the recorded size exceeds the inline
+ *      area or disagrees with the size stored in the inline list header
+ */
+static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist)
+{
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        struct jfs_ea_list *inline_list =
+            (struct jfs_ea_list *) &ji->i_inline_ea;
+        int recorded = sizeDXD(&ji->ea);
+        if (recorded == 0) {
+                ealist->size = 0;
+                return 0;
+        }
+        /* Sanity checks: bounded by the inline area, consistent header */
+        if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea)) ||
+            (le32_to_cpu(inline_list->size) != recorded))
+                return -EIO;
+        memcpy(ealist, ji->i_inline_ea, recorded);
+        return 0;
+}
+/*
+ * NAME: ea_read
+ *
+ * FUNCTION: Copy an inode's EA list into the supplied buffer
+ *
+ * PARAMETERS:
+ *      ip      - Inode pointer
+ *      ealist  - Pointer to buffer to fill in with EA
+ *
+ * NOTES: An inline EA is handed to ea_read_inline().  Otherwise the
+ *      extent described by the inode's EA descriptor is read one page
+ *      (PSIZE) at a time through the metapage layer.
+ *
+ * RETURNS: 0 for success; -EIO if the recorded size is 0 or a metapage
+ *      cannot be read
+ */
+static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
+{
+        struct super_block *sb = ip->i_sb;
+        struct jfs_inode_info *ji = JFS_IP(ip);
+        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        char *dst = (char *) ealist;
+        int total_blocks;
+        s64 first_blk;
+        int blk;
+        int remaining, chunk;
+        s32 io_len;
+        struct metapage *mp;
+        /* quick check for in-line EA */
+        if (ji->ea.flag & DXD_INLINE)
+                return ea_read_inline(ip, ealist);
+        remaining = sizeDXD(&ji->ea);
+        if (remaining == 0) {
+                jfs_error(sb, "ea_read: nbytes is 0");
+                return -EIO;
+        }
+        /*
+         * Extent length and start as recorded when the EA list was
+         * originally written to disk.
+         */
+        total_blocks = lengthDXD(&ji->ea) << sbi->l2nbperpage;
+        first_blk = addressDXD(&ji->ea) << sbi->l2nbperpage;
+        /*
+         * Walk the extent a page at a time, copying each piece into the
+         * caller's buffer.
+         */
+        for (blk = 0; blk < total_blocks; blk += sbi->nbperpage) {
+                /*
+                 * Bytes wanted from this page, and that count rounded up
+                 * to the nearest aggregate block size for the I/O request
+                 */
+                chunk = min(PSIZE, remaining);
+                io_len =
+                    ((((chunk + sb->s_blocksize - 1)) >> sb->s_blocksize_bits))
+                    << sb->s_blocksize_bits;
+                mp = read_metapage(ip, first_blk + blk, io_len, 1);
+                if (!mp)
+                        return -EIO;
+                memcpy(dst, mp->data, chunk);
+                release_metapage(mp);
+                dst += PSIZE;
+                remaining -= chunk;
+        }
+        return 0;
+}
+/*
+ * NAME: ea_get
+ *                                                                    
+ * FUNCTION: Returns buffer containing existing extended attributes.
+ *           The size of the buffer will be the larger of the existing
+ *           attributes size, or min_size.
+ *
+ *           The buffer, which may be inlined in the inode or in the
+ *           page cache, must be released by calling ea_release or ea_put
+ *                                                                    
+ * PARAMETERS:
+ *      inode   - Inode pointer
+ *      ea_buf  - Structure to be populated with ealist and its metadata
+ *      min_size- minimum size of buffer to be returned
+ *
+ * RETURNS: 0 for success; Other indicates failure
+ */
+static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
+{
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        struct super_block *sb = inode->i_sb;
+        int size;
+        int ea_size = sizeDXD(&ji->ea);
+        int blocks_needed, current_blocks;
+        s64 blkno;
+        int rc;
+        int quota_allocation = 0;
+        /* When fsck.jfs clears a bad ea, it doesn't clear the size */
+        if (ji->ea.flag == 0)
+                ea_size = 0;
+        if (ea_size == 0) {
+                if (min_size == 0) {
+                        ea_buf->flag = 0;
+                        ea_buf->max_size = 0;
+                        ea_buf->xattr = NULL;
+                        return 0;
+                }
+                if ((min_size <= sizeof (ji->i_inline_ea)) &&
+                    (ji->mode2 & INLINEEA)) {
+                        ea_buf->flag = EA_INLINE | EA_NEW;
+                        ea_buf->max_size = sizeof (ji->i_inline_ea);
+                        ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea;
+                        DXDlength(&ea_buf->new_ea, 0);
+                        DXDaddress(&ea_buf->new_ea, 0);
+                        ea_buf->new_ea.flag = DXD_INLINE;
+                        DXDsize(&ea_buf->new_ea, min_size);
+                        return 0;
+                }
+                current_blocks = 0;
+        } else if (ji->ea.flag & DXD_INLINE) {
+                if (min_size <= sizeof (ji->i_inline_ea)) {
+                        ea_buf->flag = EA_INLINE;
+                        ea_buf->max_size = sizeof (ji->i_inline_ea);
+                        ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea;
+                        goto size_check;
+                }
+                current_blocks = 0;
+        } else {
+                if (!(ji->ea.flag & DXD_EXTENT)) {
+                        jfs_error(sb, "ea_get: invalid ea.flag)");
+                        return -EIO;
+                }
+                current_blocks = (ea_size + sb->s_blocksize - 1) >>
+                    sb->s_blocksize_bits;
+        }
+        size = max(min_size, ea_size);
+        if (size > PSIZE) {
+                /*
+                 * To keep the rest of the code simple.  Allocate a
+                 * contiguous buffer to work with
+                 */
+                ea_buf->xattr = kmalloc(size, GFP_KERNEL);
+                if (ea_buf->xattr == NULL)
+                        return -ENOMEM;
+                ea_buf->flag = EA_MALLOC;
+                ea_buf->max_size = (size + sb->s_blocksize - 1) &
+                    ~(sb->s_blocksize - 1);
+                if (ea_size == 0)
+                        return 0;
+                if ((rc = ea_read(inode, ea_buf->xattr))) {
+                        kfree(ea_buf->xattr);
+                        ea_buf->xattr = NULL;
+                        return rc;
+                }
+                goto size_check;
+        }
+        blocks_needed = (min_size + sb->s_blocksize - 1) >>
+            sb->s_blocksize_bits;
+        if (blocks_needed > current_blocks) {
+                /* Allocate new blocks to quota. */
+                if (DQUOT_ALLOC_BLOCK(inode, blocks_needed))
+                        return -EDQUOT;
+                quota_allocation = blocks_needed;
+                rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed,
+                             &blkno);
+                if (rc)
+                        goto clean_up;
+                DXDlength(&ea_buf->new_ea, blocks_needed);
+                DXDaddress(&ea_buf->new_ea, blkno);
+                ea_buf->new_ea.flag = DXD_EXTENT;
+                DXDsize(&ea_buf->new_ea, min_size);
+                ea_buf->flag = EA_EXTENT | EA_NEW;
+                ea_buf->mp = get_metapage(inode, blkno,
+                                          blocks_needed << sb->s_blocksize_bits,
+                                          1);
+                if (ea_buf->mp == NULL) {
+                        dbFree(inode, blkno, (s64) blocks_needed);
+                        rc = -EIO;
+                        goto clean_up;
+                }
+                ea_buf->xattr = ea_buf->mp->data;
+                ea_buf->max_size = (min_size + sb->s_blocksize - 1) &
+                    ~(sb->s_blocksize - 1);
+                if (ea_size == 0)
+                        return 0;
+                if ((rc = ea_read(inode, ea_buf->xattr))) {
+                        discard_metapage(ea_buf->mp);
+                        dbFree(inode, blkno, (s64) blocks_needed);
+                        goto clean_up;
+                }
+                goto size_check;
+        }
+        ea_buf->flag = EA_EXTENT;
+        ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea),
+                                   lengthDXD(&ji->ea) << sb->s_blocksize_bits,
+                                   1);
+        if (ea_buf->mp == NULL) {
+                rc = -EIO;
+                goto clean_up;
+        }
+        ea_buf->xattr = ea_buf->mp->data;
+        ea_buf->max_size = (ea_size + sb->s_blocksize - 1) &
+            ~(sb->s_blocksize - 1);
+      size_check:
+        if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
+                printk(KERN_ERR "ea_get: invalid extended attribute\n");
+                dump_mem("xattr", ea_buf->xattr, ea_size);
+                ea_release(inode, ea_buf);
+                rc = -EIO;
+                goto clean_up;
+        }
+        return ea_size;
+      clean_up:
+        /* Rollback quota allocation */
+        if (quota_allocation)
+                DQUOT_FREE_BLOCK(inode, quota_allocation);
+        return (rc);
+}
+/*
+ * ea_release
+ *
+ * Drop the working buffer described by ea_buf without committing it.
+ * An EA_MALLOC buffer is kfree'd.  Otherwise, for an EA_EXTENT buffer, the
+ * metapage is released and, when the extent is also flagged EA_NEW, the
+ * blocks recorded in new_ea are handed back through dbFree().  A buffer
+ * with neither flag set needs no cleanup here.
+ */
+static void ea_release(struct inode *inode, struct ea_buffer *ea_buf)
+{
+        if (ea_buf->flag & EA_MALLOC) {
+                kfree(ea_buf->xattr);
+                return;
+        }
+        if (!(ea_buf->flag & EA_EXTENT))
+                return;
+        assert(ea_buf->mp);
+        release_metapage(ea_buf->mp);
+        /* Only a freshly allocated extent has blocks to give back */
+        if (!(ea_buf->flag & EA_NEW))
+                return;
+        dbFree(inode, addressDXD(&ea_buf->new_ea), lengthDXD(&ea_buf->new_ea));
+}
+/*
+ * ea_put
+ *
+ * Commit the EA list held in ea_buf as the inode's new extended attribute
+ * data, or remove the inode's EA entirely when new_size is 0.
+ *
+ * PARAMETERS:
+ *      inode    - inode whose EA descriptor (ji->ea) is updated
+ *      ea_buf   - working buffer; its flag selects how the data is
+ *                 written out and how the buffer itself is disposed of
+ *      new_size - size in bytes of the new EA list, 0 meaning "no EA"
+ *
+ * RETURNS: 0 on success; otherwise the error from ea_write() (returned
+ *          before any transaction is started) or from txCommit().
+ */
+static int ea_put(struct inode *inode, struct ea_buffer *ea_buf, int new_size)
+{
+        struct jfs_inode_info *ji = JFS_IP(inode);
+        /* NOTE(review): new_blocks is assigned below but never read here */
+        unsigned long old_blocks, new_blocks;
+        int rc = 0;
+        tid_t tid;
+        if (new_size == 0) {
+                /*
+                 * Nothing left to store: drop the buffer and use a NULL
+                 * ea_buf to select the "remove EA" path further down.
+                 */
+                ea_release(inode, ea_buf);
+                ea_buf = NULL;
+        } else if (ea_buf->flag & EA_INLINE) {
+                /* Data already sits in the inode; just describe it */
+                assert(new_size <= sizeof (ji->i_inline_ea));
+                ji->mode2 &= ~INLINEEA;
+                ea_buf->new_ea.flag = DXD_INLINE;
+                DXDsize(&ea_buf->new_ea, new_size);
+                DXDaddress(&ea_buf->new_ea, 0);
+                DXDlength(&ea_buf->new_ea, 0);
+        } else if (ea_buf->flag & EA_MALLOC) {
+                /* The temporary buffer is freed whether or not ea_write succeeds */
+                rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea);
+                kfree(ea_buf->xattr);
+        } else if (ea_buf->flag & EA_NEW) {
+                /* We have already allocated a new dxd */
+                flush_metapage(ea_buf->mp);
+        } else {
+                /* ->xattr must point to original ea's metapage */
+                rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea);
+                discard_metapage(ea_buf->mp);
+        }
+        /* A failed ea_write() aborts before any transaction is started */
+        if (rc)
+                return rc;
+        /* Update the inode's EA descriptor under a transaction and commit_sem */
+        tid = txBegin(inode->i_sb, 0);
+        down(&ji->commit_sem);
+        old_blocks = new_blocks = 0;
+        if (ji->ea.flag & DXD_EXTENT) {
+                /* Remember the old extent's length for the quota release below */
+                invalidate_dxd_metapages(inode, ji->ea);
+                old_blocks = lengthDXD(&ji->ea);
+        }
+        if (ea_buf) {
+                txEA(tid, inode, &ji->ea, &ea_buf->new_ea);
+                if (ea_buf->new_ea.flag & DXD_EXTENT) {
+                        new_blocks = lengthDXD(&ea_buf->new_ea);
+                        /*
+                         * Old EA was inline, new one is an extent: set
+                         * INLINEEA again (the EA_INLINE branch above clears
+                         * the same bit) - presumably marking the inline
+                         * area as available; confirm against the on-disk
+                         * format definition.
+                         */
+                        if (ji->ea.flag & DXD_INLINE)
+                                ji->mode2 |= INLINEEA;
+                }
+                ji->ea = ea_buf->new_ea;
+        } else {
+                /* Removal: log against a NULL new EA and clear the descriptor */
+                txEA(tid, inode, &ji->ea, NULL);
+                if (ji->ea.flag & DXD_INLINE)
+                        ji->mode2 |= INLINEEA;
+                ji->ea.flag = 0;
+                ji->ea.size = 0;
+        }
+        /* If old blocks exist, they must be removed from quota allocation. */
+        if (old_blocks)
+                DQUOT_FREE_BLOCK(inode, old_blocks);
+        inode->i_ctime = CURRENT_TIME;
+        rc = txCommit(tid, 1, &inode, 0);
+        txEnd(tid);
+        up(&ji->commit_sem);
+        return rc;
+}
+/*
+ * can_set_system_xattr
+ *
+ * Policy for names in the system.* namespace, kept out of the generic
+ * xattr code path.  Only XATTR_NAME_ACL_ACCESS and XATTR_NAME_ACL_DEFAULT
+ * are accepted, and only when CONFIG_JFS_POSIX_ACL is enabled; any other
+ * name yields -EOPNOTSUPP.  The caller must own the inode (fsuid match) or
+ * hold CAP_FOWNER, otherwise -EPERM is returned.
+ *
+ * On success the matching cached ACL in the JFS inode is dropped and reset
+ * to JFS_ACL_NOT_CACHED.  For the access ACL, i_mode is additionally
+ * updated from posix_acl_equiv_mode() and the inode is marked dirty.
+ */
+static int can_set_system_xattr(struct inode *inode, const char *name,
+                                const void *value, size_t value_len)
+{
+#ifdef CONFIG_JFS_POSIX_ACL
+        struct jfs_inode_info *jfs_ip = JFS_IP(inode);
+        struct posix_acl *parsed, *cached;
+        int err;
+        if (current->fsuid != inode->i_uid) {
+                if (!capable(CAP_FOWNER))
+                        return -EPERM;
+        }
+        if (!strcmp(name, XATTR_NAME_ACL_ACCESS)) {
+                /* The access ACL is tied to i_mode */
+                parsed = posix_acl_from_xattr(value, value_len);
+                if (IS_ERR(parsed)) {
+                        err = PTR_ERR(parsed);
+                        printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
+                               err);
+                        return err;
+                }
+                if (parsed != NULL) {
+                        mode_t new_mode = inode->i_mode;
+                        err = posix_acl_equiv_mode(parsed, &new_mode);
+                        posix_acl_release(parsed);
+                        if (err < 0) {
+                                printk(KERN_ERR
+                                       "posix_acl_equiv_mode returned %d\n",
+                                       err);
+                                return err;
+                        }
+                        inode->i_mode = new_mode;
+                        mark_inode_dirty(inode);
+                }
+                /* The ACL is being replaced, so the cached copy is stale */
+                cached = jfs_ip->i_acl;
+                if (cached != JFS_ACL_NOT_CACHED)
+                        posix_acl_release(cached);
+                jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
+                return 0;
+        }
+        if (!strcmp(name, XATTR_NAME_ACL_DEFAULT)) {
+                /* Parse only to validate the value; the result is not kept */
+                parsed = posix_acl_from_xattr(value, value_len);
+                if (IS_ERR(parsed)) {
+                        err = PTR_ERR(parsed);
+                        printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
+                               err);
+                        return err;
+                }
+                posix_acl_release(parsed);
+                /* The default ACL is being replaced; drop the cached copy */
+                cached = jfs_ip->i_default_acl;
+                if (cached && (cached != JFS_ACL_NOT_CACHED))
+                        posix_acl_release(cached);
+                jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
+                return 0;
+        }
+#endif                  /* CONFIG_JFS_POSIX_ACL */
+        return -EOPNOTSUPP;
+}
+/*
+ * can_set_xattr
+ *
+ * Decide whether the caller may create, replace or remove the attribute
+ * "name" on inode.
+ *
+ * RETURNS: 0 if the operation is allowed, otherwise
+ *      -EROFS      - the inode is on a read-only filesystem
+ *      -EPERM      - immutable/append-only inode or symlink; trusted.*
+ *                    without CAP_SYS_ADMIN; user.* / os2.* on anything but
+ *                    a regular file or a non-sticky directory
+ *      -EOPNOTSUPP - unknown namespace
+ *      or whatever can_set_system_xattr() / permission() report.
+ *
+ * The namespace tests must match the prefix (== 0).  Testing for a
+ * mismatch would hand every non-trusted name to the CAP_SYS_ADMIN check
+ * and let every non-security name skip the checks that follow.
+ */
+static int can_set_xattr(struct inode *inode, const char *name,
+                         const void *value, size_t value_len)
+{
+        if (IS_RDONLY(inode))
+                return -EROFS;
+        if (IS_IMMUTABLE(inode) || IS_APPEND(inode) || S_ISLNK(inode->i_mode))
+                return -EPERM;
+        if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
+                /*
+                 * "system.*"
+                 */
+                return can_set_system_xattr(inode, name, value, value_len);
+        /* "trusted.*" is reserved for CAP_SYS_ADMIN */
+        if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0)
+                return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
+#ifdef CONFIG_JFS_SECURITY
+        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)
+            == 0)
+                return 0;       /* Leave it to the security module */
+#endif
+        /* Anything that is not "user.*" or "os2.*" is unsupported */
+        if((strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0) &&
+           (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) != 0))
+                return -EOPNOTSUPP;
+        /* Only regular files and non-sticky directories take user EAs */
+        if (!S_ISREG(inode->i_mode) &&
+            (!S_ISDIR(inode->i_mode) || inode->i_mode &S_ISVTX))
+                return -EPERM;
+        return permission(inode, MAY_WRITE, NULL);
+}
+/*
+ * __jfs_setxattr
+ *
+ * Create, replace or remove the extended attribute "name" on inode.
+ *
+ * PARAMETERS:
+ *      inode     - inode whose EA list is modified
+ *      name      - attribute name including its namespace prefix; an
+ *                  XATTR_OS2_PREFIX prefix is stripped before the name is
+ *                  compared or stored
+ *      value     - new value, or NULL to remove the attribute
+ *      value_len - length of value in bytes
+ *      flags     - XATTR_CREATE: fail with -EEXIST if the name exists;
+ *                  XATTR_REPLACE: fail with -ENODATA if it does not
+ *
+ * RETURNS: 0 on success (including removal of a name that was not present
+ *          when XATTR_REPLACE is not set), -ENOMEM, -EINVAL if the internal
+ *          size cross-check fails, or the error reported by
+ *          can_set_xattr(), ea_get() or ea_put().
+ *
+ * The inode's xattr_sem is held for write while the list is modified.
+ */
+int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
+                   size_t value_len, int flags)
+{
+        struct jfs_ea_list *ealist;
+        struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL;
+        struct ea_buffer ea_buf;
+        int old_ea_size = 0;
+        int xattr_size;
+        int new_size;
+        int namelen = strlen(name);
+        char *os2name = NULL;
+        int found = 0;
+        int rc;
+        int length;
+        if ((rc = can_set_xattr(inode, name, value, value_len)))
+                return rc;
+        /* os2.* attributes are handled without their prefix */
+        if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+                os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
+                                  GFP_KERNEL);
+                if (!os2name)
+                        return -ENOMEM;
+                strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
+                name = os2name;
+                namelen -= XATTR_OS2_PREFIX_LEN;
+        }
+        down_write(&JFS_IP(inode)->xattr_sem);
+        /* First pass asks for no extra room; we come back with more if needed */
+        xattr_size = ea_get(inode, &ea_buf, 0);
+        if (xattr_size < 0) {
+                rc = xattr_size;
+                goto out;
+        }
+      again:
+        ealist = (struct jfs_ea_list *) ea_buf.xattr;
+        /* new_size accumulates the size of the list as it will be written */
+        new_size = sizeof (struct jfs_ea_list);
+        if (xattr_size) {
+                /* Sum every entry we keep; remember the one being replaced */
+                for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist);
+                     ea = NEXT_EA(ea)) {
+                        if ((namelen == ea->namelen) &&
+                            (memcmp(name, ea->name, namelen) == 0)) {
+                                found = 1;
+                                if (flags & XATTR_CREATE) {
+                                        rc = -EEXIST;
+                                        goto release;
+                                }
+                                old_ea = ea;
+                                old_ea_size = EA_SIZE(ea);
+                                next_ea = NEXT_EA(ea);
+                        } else
+                                new_size += EA_SIZE(ea);
+                }
+        }
+        if (!found) {
+                if (flags & XATTR_REPLACE) {
+                        rc = -ENODATA;
+                        goto release;
+                }
+                /* Removing a name that does not exist is a no-op */
+                if (value == NULL) {
+                        rc = 0;
+                        goto release;
+                }
+        }
+        /* Entry header + name + NUL terminator + value */
+        if (value)
+                new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len;
+        if (new_size > ea_buf.max_size) {
+                /*
+                 * We need to allocate more space for merged ea list.
+                 * We should only have loop to again: once.
+                 */
+                ea_release(inode, &ea_buf);
+                xattr_size = ea_get(inode, &ea_buf, new_size);
+                if (xattr_size < 0) {
+                        rc = xattr_size;
+                        goto out;
+                }
+                goto again;
+        }
+        /* Remove old ea of the same name */
+        if (found) {
+                /* number of bytes following target EA */
+                length = (char *) END_EALIST(ealist) - (char *) next_ea;
+                if (length > 0)
+                        memmove(old_ea, next_ea, length);
+                xattr_size -= old_ea_size;
+        }
+        /* Add new entry to the end */
+        if (value) {
+                if (xattr_size == 0)
+                        /* Completely new ea list */
+                        xattr_size = sizeof (struct jfs_ea_list);
+                ea = (struct jfs_ea *) ((char *) ealist + xattr_size);
+                ea->flag = 0;
+                /*
+                 * NOTE(review): namelen and value_len are stored without a
+                 * range check here; valuelen is written as a 16-bit value.
+                 * Presumably the caller bounds both - confirm.
+                 */
+                ea->namelen = namelen;
+                ea->valuelen = (cpu_to_le16(value_len));
+                memcpy(ea->name, name, namelen);
+                ea->name[namelen] = 0;
+                /* The value is stored right after the NUL-terminated name */
+                if (value_len)
+                        memcpy(&ea->name[namelen + 1], value, value_len);
+                xattr_size += EA_SIZE(ea);
+        }
+        /* DEBUG - If we did this right, these number match */
+        if (xattr_size != new_size) {
+                printk(KERN_ERR
+                       "jfs_xsetattr: xattr_size = %d, new_size = %d\n",
+                       xattr_size, new_size);
+                rc = -EINVAL;
+                goto release;
+        }
+        /*
+         * If we're left with an empty list, there's no ea
+         */
+        if (new_size == sizeof (struct jfs_ea_list))
+                new_size = 0;
+        ealist->size = cpu_to_le32(new_size);
+        /* ea_put() disposes of ea_buf itself, so skip the release label */
+        rc = ea_put(inode, &ea_buf, new_size);
+        goto out;
+      release:
+        ea_release(inode, &ea_buf);
+      out:
+        up_write(&JFS_IP(inode)->xattr_sem);
+        if (os2name)
+                kfree(os2name);
+        return rc;
+}
+/*
+ * jfs_setxattr
+ *
+ * Dentry-level entry point for setting an attribute.  A NULL value is
+ * turned into an empty (zero-length) value, so the attribute is stored
+ * rather than removed by __jfs_setxattr().
+ */
+int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+                 size_t value_len, int flags)
+{
+        struct inode *inode = dentry->d_inode;
+        if (!value) {
+                /* empty EA, do not remove */
+                value_len = 0;
+                value = "";
+        }
+        return __jfs_setxattr(inode, name, value, value_len, flags);
+}
+/*
+ * can_get_xattr
+ *
+ * Decide whether the caller may read the attribute "name".  security.*
+ * (when CONFIG_JFS_SECURITY is set) and system.* are always readable,
+ * trusted.* needs CAP_SYS_ADMIN, and every other name is subject to
+ * ordinary read permission on the inode.  Returns 0 or a negative errno.
+ */
+static int can_get_xattr(struct inode *inode, const char *name)
+{
+#ifdef CONFIG_JFS_SECURITY
+        if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+                return 0;
+#endif
+        if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+                if (capable(CAP_SYS_ADMIN))
+                        return 0;
+                return -EPERM;
+        }
+        if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+                return 0;
+        return permission(inode, MAY_READ, NULL);
+}
+/*
+ * __jfs_getxattr
+ *
+ * Look up the extended attribute "name" on inode.
+ *
+ * PARAMETERS:
+ *      inode    - inode to read from
+ *      name     - attribute name; an XATTR_OS2_PREFIX prefix is stripped
+ *                 before the lookup
+ *      data     - destination buffer, or NULL to query the size only
+ *      buf_size - capacity of data in bytes
+ *
+ * RETURNS: the length of the value; -ERANGE if data is too small;
+ *          -ENODATA if the name is not present; -ENOMEM; or the error
+ *          reported by can_get_xattr() or ea_get().
+ *
+ * The inode's xattr_sem is held for read during the lookup.
+ */
+ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
+                       size_t buf_size)
+{
+        struct ea_buffer ea_buf;
+        struct jfs_ea_list *ealist;
+        struct jfs_ea *ea;
+        char *os2name = NULL;
+        int namelen = strlen(name);
+        int xattr_size;
+        ssize_t size;
+        int rc;
+        rc = can_get_xattr(inode, name);
+        if (rc)
+                return rc;
+        if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+                /* Work on a private copy of the name without the prefix */
+                namelen -= XATTR_OS2_PREFIX_LEN;
+                os2name = kmalloc(namelen + 1, GFP_KERNEL);
+                if (!os2name)
+                        return -ENOMEM;
+                strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
+                name = os2name;
+        }
+        down_read(&JFS_IP(inode)->xattr_sem);
+        xattr_size = ea_get(inode, &ea_buf, 0);
+        if (xattr_size < 0) {
+                size = xattr_size;
+                goto out;
+        }
+        /* Stays -ENODATA unless a matching entry is found below */
+        size = -ENODATA;
+        if (xattr_size == 0)
+                goto release;
+        ealist = (struct jfs_ea_list *) ea_buf.xattr;
+        /* Find the named attribute */
+        ea = FIRST_EA(ealist);
+        while (ea < END_EALIST(ealist)) {
+                if ((namelen != ea->namelen) ||
+                    memcmp(name, ea->name, namelen) != 0) {
+                        ea = NEXT_EA(ea);
+                        continue;
+                }
+                /* Found it */
+                size = le16_to_cpu(ea->valuelen);
+                if (data) {
+                        if (size > buf_size)
+                                size = -ERANGE;
+                        else
+                                /* value follows the NUL-terminated name */
+                                memcpy(data,
+                                       ((char *) &ea->name) + ea->namelen + 1,
+                                       size);
+                }
+                break;
+        }
+      release:
+        ea_release(inode, &ea_buf);
+      out:
+        up_read(&JFS_IP(inode)->xattr_sem);
+        if (os2name)
+                kfree(os2name);
+        return size;
+}
+/*
+ * jfs_getxattr
+ *
+ * Dentry-level entry point: forwards to __jfs_getxattr() on the dentry's
+ * inode and returns its result.
+ */
+ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
+                     size_t buf_size)
+{
+        struct inode *inode = dentry->d_inode;
+        int err = __jfs_getxattr(inode, name, data, buf_size);
+        return err;
+}
+/*
+ * can_list
+ *
+ * Listing an attribute name needs no special permission, with one
+ * exception: trusted.* names are only shown to callers holding
+ * CAP_SYS_ADMIN.  Returns nonzero if the entry may be listed.
+ */
+static inline int can_list(struct jfs_ea *ea)
+{
+        /* Any name outside the trusted.* namespace is always visible */
+        if (strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+                return 1;
+        return capable(CAP_SYS_ADMIN) ? 1 : 0;
+}
+/*
+ * jfs_listxattr
+ *
+ * Produce the list of attribute names the caller is allowed to see.
+ *
+ * PARAMETERS:
+ *      dentry   - dentry of the inode to list
+ *      data     - destination buffer, or NULL to query the size only
+ *      buf_size - capacity of data in bytes
+ *
+ * RETURNS: total bytes needed for the listable names (each counted as
+ *          name_size() + 1), 0 if the inode has no EA, -ERANGE if data
+ *          is too small, or the error reported by ea_get().
+ *
+ * The inode's xattr_sem is held for read while the list is walked.
+ */
+ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
+{
+        struct inode *inode = dentry->d_inode;
+        struct ea_buffer ea_buf;
+        struct jfs_ea_list *ealist;
+        struct jfs_ea *ea;
+        char *cursor;
+        ssize_t size = 0;
+        int xattr_size;
+        down_read(&JFS_IP(inode)->xattr_sem);
+        xattr_size = ea_get(inode, &ea_buf, 0);
+        if (xattr_size < 0) {
+                size = xattr_size;
+                goto out;
+        }
+        if (!xattr_size)
+                goto release;
+        ealist = (struct jfs_ea_list *) ea_buf.xattr;
+        /* First pass: add up the space the visible names require */
+        ea = FIRST_EA(ealist);
+        while (ea < END_EALIST(ealist)) {
+                if (can_list(ea))
+                        size += name_size(ea) + 1;
+                ea = NEXT_EA(ea);
+        }
+        /* Size query only */
+        if (data == NULL)
+                goto release;
+        if (size > buf_size) {
+                size = -ERANGE;
+                goto release;
+        }
+        /* Second pass: copy the visible names out back to back */
+        cursor = data;
+        ea = FIRST_EA(ealist);
+        while (ea < END_EALIST(ealist)) {
+                if (can_list(ea)) {
+                        int copied = copy_name(cursor, ea);
+                        cursor += copied + 1;
+                }
+                ea = NEXT_EA(ea);
+        }
+      release:
+        ea_release(inode, &ea_buf);
+      out:
+        up_read(&JFS_IP(inode)->xattr_sem);
+        return size;
+}
+/*
+ * jfs_removexattr
+ *
+ * Remove the attribute "name" by calling __jfs_setxattr() with a NULL
+ * value.  XATTR_REPLACE makes the call fail with -ENODATA when the
+ * attribute does not exist.
+ */
+int jfs_removexattr(struct dentry *dentry, const char *name)
+{
+        struct inode *inode = dentry->d_inode;
+        return __jfs_setxattr(inode, name, NULL, 0, XATTR_REPLACE);
+}
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jfs